Replace theano.tensor alias T with tt in theano.tensor sub-package

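Several indirect, fully qualified module-level references (for example `theano.tensor.elemwise.Elemwise`, `theano.tensor.addbroadcast`, `theano.tensor.sqrt` and `theano.tensor.constant`) were also replaced with direct imports or with the `tt` alias.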

Replace theano.tensor alias T with tt in theano.tensor sub-package
c51f5936 · Brandon T. Willard · 2cb3a154 · c51f5936 · c51f5936 · c51f5936
--- a/theano/tensor/blas_c.py
+++ b/theano/tensor/blas_c.py
+import theano.tensor.basic as tt
 from theano import config
 from theano.gof.params_type import ParamsType
 from theano.scalar import bool as bool_t
@@ -6,7 +8,6 @@ from theano.tensor.blas import ldflags, blas_header_text, blas_header_version
 from theano.tensor.blas import blas_optdb, optdb, local_optimizer
 from theano.tensor.blas import Ger, ger, ger_destructive
 from theano.tensor.blas import Gemv, gemv_inplace, gemv_no_inplace
-from theano.tensor import basic as T
 class BaseBLAS(object):
@@ -706,10 +707,10 @@ def make_c_gemv_destructive(node):
        dest = inputs[0]
        if (
            dest.owner
-            and isinstance(dest.owner.op, T.AllocEmpty)
+            and isinstance(dest.owner.op, tt.AllocEmpty)
            and len(dest.clients) > 1
        ):
-            inputs[0] = T.AllocEmpty(dest.dtype)(*dest.owner.inputs)
+            inputs[0] = tt.AllocEmpty(dest.dtype)(*dest.owner.inputs)
        return [cgemv_inplace(*inputs)]

--- a/theano/tensor/fft.py
+++ b/theano/tensor/fft.py
 import numpy as np
+import theano.tensor as tt
 from theano import gof
-import theano.tensor as T
 from theano.gradient import DisconnectedType
@@ -10,10 +12,10 @@ class RFFTOp(gof.Op):
    def output_type(self, inp):
        # add extra dim for real/imag
-        return T.TensorType(inp.dtype, broadcastable=[False] * (inp.type.ndim + 1))
+        return tt.TensorType(inp.dtype, broadcastable=[False] * (inp.type.ndim + 1))
    def make_node(self, a, s=None):
-        a = T.as_tensor_variable(a)
+        a = tt.as_tensor_variable(a)
        if a.ndim < 2:
            raise TypeError(
                "%s: input must have dimension > 2, with first dimension batches"
@@ -22,10 +24,10 @@ class RFFTOp(gof.Op):
        if s is None:
            s = a.shape[1:]
-            s = T.as_tensor_variable(s)
+            s = tt.as_tensor_variable(s)
        else:
-            s = T.as_tensor_variable(s)
+            s = tt.as_tensor_variable(s)
-            if s.dtype not in T.integer_dtypes:
+            if s.dtype not in tt.integer_dtypes:
                raise TypeError(
                    "%s: length of the transformed axis must be"
                    " of type integer" % self.__class__.__name__
@@ -54,7 +56,7 @@ class RFFTOp(gof.Op):
            + [slice(1, (s[-1] // 2) + (s[-1] % 2))]
            + [slice(None)]
        )
-        gout = T.set_subtensor(gout[idx], gout[idx] * 0.5)
+        gout = tt.set_subtensor(gout[idx], gout[idx] * 0.5)
        return [irfft_op(gout, s), DisconnectedType()()]
    def connection_pattern(self, node):
@@ -71,10 +73,10 @@ class IRFFTOp(gof.Op):
    def output_type(self, inp):
        # remove extra dim for real/imag
-        return T.TensorType(inp.dtype, broadcastable=[False] * (inp.type.ndim - 1))
+        return tt.TensorType(inp.dtype, broadcastable=[False] * (inp.type.ndim - 1))
    def make_node(self, a, s=None):
-        a = T.as_tensor_variable(a)
+        a = tt.as_tensor_variable(a)
        if a.ndim < 3:
            raise TypeError(
                "%s: input must have dimension >= 3,  with " % self.__class__.__name__
@@ -83,11 +85,11 @@ class IRFFTOp(gof.Op):
        if s is None:
            s = a.shape[1:-1]
-            s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)
+            s = tt.set_subtensor(s[-1], (s[-1] - 1) * 2)
-            s = T.as_tensor_variable(s)
+            s = tt.as_tensor_variable(s)
        else:
-            s = T.as_tensor_variable(s)
+            s = tt.as_tensor_variable(s)
-            if s.dtype not in T.integer_dtypes:
+            if s.dtype not in tt.integer_dtypes:
                raise TypeError(
                    "%s: length of the transformed axis must be"
                    " of type integer" % self.__class__.__name__
@@ -117,7 +119,7 @@ class IRFFTOp(gof.Op):
            + [slice(1, (s[-1] // 2) + (s[-1] % 2))]
            + [slice(None)]
        )
-        gf = T.set_subtensor(gf[idx], gf[idx] * 2)
+        gf = tt.set_subtensor(gf[idx], gf[idx] * 2)
        return [gf, DisconnectedType()()]
    def connection_pattern(self, node):
@@ -157,7 +159,7 @@ def rfft(inp, norm=None):
    cond_norm = _unitary(norm)
    scaling = 1
    if cond_norm == "ortho":
-        scaling = T.sqrt(s.prod().astype(inp.dtype))
+        scaling = tt.sqrt(s.prod().astype(inp.dtype))
    return rfft_op(inp, s) / scaling
@@ -196,9 +198,9 @@ def irfft(inp, norm=None, is_odd=False):
    s = inp.shape[1:-1]
    if is_odd:
-        s = T.set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
+        s = tt.set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
    else:
-        s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)
+        s = tt.set_subtensor(s[-1], (s[-1] - 1) * 2)
    cond_norm = _unitary(norm)
    scaling = 1
@@ -206,7 +208,7 @@ def irfft(inp, norm=None, is_odd=False):
    if cond_norm is None:
        scaling = s.prod().astype(inp.dtype)
    elif cond_norm == "ortho":
-        scaling = T.sqrt(s.prod().astype(inp.dtype))
+        scaling = tt.sqrt(s.prod().astype(inp.dtype))
    return irfft_op(inp, s) / scaling

--- a/theano/tensor/nnet/bn.py
+++ b/theano/tensor/nnet/bn.py
 import numpy as np
 import theano
+import theano.tensor.basic as tt
 from theano import Apply, Op
 from theano.gof import local_optimizer
 from theano.gof.opt import copy_stack_trace
-from theano.tensor import as_tensor_variable, TensorType
+from theano.scalar import Composite, add, as_common_dtype, mul, sub, true_div
-from theano.tensor import basic as T
+from theano.tensor import TensorType, as_tensor_variable
+from theano.tensor.elemwise import Elemwise
 from theano.tensor.opt import register_specialize_device
-from theano.scalar import Composite, as_common_dtype
-from theano.scalar import add, sub, true_div, mul
 class BNComposite(Composite):
@@ -72,9 +74,7 @@ def batch_normalization(inputs, gamma, beta, mean, std, mode="low_mem"):
        between implementation is likely to be less important on the full model fprop/bprop.
    """
    if mode == "low_mem":
-        elm_bn = theano.tensor.elemwise.Elemwise(
+        elm_bn = Elemwise(scalar_op=BNComposite(dtype=inputs.dtype))
-            scalar_op=BNComposite(dtype=inputs.dtype)
-        )
        rval = elm_bn(inputs, mean, std, gamma, beta)
    elif mode == "high_mem":
        rval = (inputs - mean) * (gamma / std) + beta
@@ -239,8 +239,8 @@ def batch_normalization_train(
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
    else:
-        gamma = T.addbroadcast(gamma, *axes)
+        gamma = tt.addbroadcast(gamma, *axes)
-        beta = T.addbroadcast(beta, *axes)
+        beta = tt.addbroadcast(beta, *axes)
    batchnorm_op = AbstractBatchNormTrain(axes=axes)
@@ -251,8 +251,8 @@ def batch_normalization_train(
            running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)
            running_var = running_var.dimshuffle(params_dimshuffle_pattern)
        else:
-            running_mean = T.addbroadcast(running_mean, *axes)
+            running_mean = tt.addbroadcast(running_mean, *axes)
-            running_var = T.addbroadcast(running_var, *axes)
+            running_var = tt.addbroadcast(running_var, *axes)
        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
            inputs,
            gamma,
@@ -263,11 +263,11 @@ def batch_normalization_train(
            running_var=running_var,
        )
        if new_running_mean.broadcastable != running_mean.broadcastable:
-            new_running_mean = T.patternbroadcast(
+            new_running_mean = tt.patternbroadcast(
                new_running_mean, running_mean.broadcastable
            )
        if new_running_var.broadcastable != running_var.broadcastable:
-            new_running_var = T.patternbroadcast(
+            new_running_var = tt.patternbroadcast(
                new_running_var, running_var.broadcastable
            )
        results = (out, mean, invstd, new_running_mean, new_running_var)
@@ -376,10 +376,10 @@ def batch_normalization_test(
        mean = mean.dimshuffle(params_dimshuffle_pattern)
        var = var.dimshuffle(params_dimshuffle_pattern)
    else:
-        gamma = T.addbroadcast(gamma, *axes)
+        gamma = tt.addbroadcast(gamma, *axes)
-        beta = T.addbroadcast(beta, *axes)
+        beta = tt.addbroadcast(beta, *axes)
-        mean = T.addbroadcast(mean, *axes)
+        mean = tt.addbroadcast(mean, *axes)
-        var = T.addbroadcast(var, *axes)
+        var = tt.addbroadcast(var, *axes)
    batchnorm_op = AbstractBatchNormInference(axes=axes)
    return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)
@@ -610,14 +610,13 @@ class AbstractBatchNormInference(Op):
            )
        scale, bias, est_mean, est_var = (
-            theano.tensor.addbroadcast(t, *axes)
+            tt.addbroadcast(t, *axes) for t in (scale, bias, est_mean, est_var)
-            for t in (scale, bias, est_mean, est_var)
        )
        # define helper expressions
        est_var_eps = est_var + epsilon
-        est_std = theano.tensor.sqrt(est_var_eps)
+        est_std = tt.sqrt(est_var_eps)
-        two = theano.tensor.constant(2.0)
+        two = tt.constant(2.0)
        # define and return gradients
        dx = dy * (scale / est_std)
@@ -673,7 +672,7 @@ class AbstractBatchNormTrainGrad(Op):
        ddinputs, ddscale, ddbias = grads
        x_diff = x - x_mean
-        mean_dy_x_diff = T.mean(dy * x_diff, axis=self.axes, keepdims=True)
+        mean_dy_x_diff = tt.mean(dy * x_diff, axis=self.axes, keepdims=True)
        # compute gradients given each of the output gradients
        g_wrt_x = 0
@@ -683,10 +682,10 @@ class AbstractBatchNormTrainGrad(Op):
        g_wrt_x_invstd = 0
        if not isinstance(ddinputs.type, theano.gradient.DisconnectedType):
-            ccc = scale * (ddinputs - T.mean(ddinputs, axis=self.axes, keepdims=True))
+            ccc = scale * (ddinputs - tt.mean(ddinputs, axis=self.axes, keepdims=True))
            ddd = (x_invstd ** 3) * (
-                ccc * T.mean(dy * x_diff, axis=self.axes, keepdims=True)
+                ccc * tt.mean(dy * x_diff, axis=self.axes, keepdims=True)
-                + dy * T.mean(ccc * x_diff, axis=self.axes, keepdims=True)
+                + dy * tt.mean(ccc * x_diff, axis=self.axes, keepdims=True)
            )
            g_wrt_x = g_wrt_x - ddd
@@ -695,19 +694,19 @@ class AbstractBatchNormTrainGrad(Op):
                - (
                    (x_invstd ** 3)
                    * x_diff
-                    * T.mean(ccc * x_diff, axis=self.axes, keepdims=True)
+                    * tt.mean(ccc * x_diff, axis=self.axes, keepdims=True)
                )
            )
            eee = (dy * x_invstd) - ((x_invstd ** 3) * x_diff * mean_dy_x_diff)
-            g_wrt_scale = g_wrt_scale + T.sum(
+            g_wrt_scale = g_wrt_scale + tt.sum(
-                ddinputs * (eee - T.mean(eee, axis=self.axes, keepdims=True)),
+                ddinputs * (eee - tt.mean(eee, axis=self.axes, keepdims=True)),
                axis=self.axes,
                keepdims=True,
            )
-            g_wrt_x_mean = g_wrt_x_mean + T.sum(ddd, axis=self.axes, keepdims=True)
+            g_wrt_x_mean = g_wrt_x_mean + tt.sum(ddd, axis=self.axes, keepdims=True)
-            g_wrt_x_invstd = g_wrt_x_invstd + T.sum(
+            g_wrt_x_invstd = g_wrt_x_invstd + tt.sum(
                ccc * (dy - 3 * (x_invstd ** 2) * x_diff * mean_dy_x_diff),
                axis=self.axes,
                keepdims=True,
@@ -717,14 +716,14 @@ class AbstractBatchNormTrainGrad(Op):
            g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
            g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
            g_wrt_x_mean = g_wrt_x_mean - (
-                x_invstd * ddscale * T.sum(dy, axis=self.axes, keepdims=True)
+                x_invstd * ddscale * tt.sum(dy, axis=self.axes, keepdims=True)
            )
            g_wrt_x_invstd = g_wrt_x_invstd + (
-                ddscale * T.sum(dy * x_diff, axis=self.axes, keepdims=True)
+                ddscale * tt.sum(dy * x_diff, axis=self.axes, keepdims=True)
            )
        if not isinstance(ddbias.type, theano.gradient.DisconnectedType):
-            g_wrt_dy = g_wrt_dy + T.fill(dy, ddbias)
+            g_wrt_dy = g_wrt_dy + tt.fill(dy, ddbias)
        # depending on which output gradients are given,
        # some inputs should be disconnected
@@ -804,7 +803,7 @@ def local_abstract_batch_norm_train(node):
    # The epsilon should not upcast the dtype.
    if var.dtype == "float32" and epsilon.dtype == "float64":
        epsilon = epsilon.astype("float32")
-    invstd = T.inv(T.sqrt(var + epsilon))
+    invstd = tt.inv(tt.sqrt(var + epsilon))
    out = (x - mean) * (scale * invstd) + bias
    results = [out, mean, invstd]
@@ -816,7 +815,7 @@ def local_abstract_batch_norm_train(node):
        )
        results.append(running_mean)
    if len(node.inputs) > 6:
-        m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
+        m = tt.cast(tt.prod(x.shape) / tt.prod(scale.shape), theano.config.floatX)
        running_var = node.inputs[6]
        running_var = (
            running_var * (1.0 - running_average_factor)
@@ -825,7 +824,7 @@ def local_abstract_batch_norm_train(node):
        results.append(running_var)
    results = [
-        T.patternbroadcast(r, r_orig.broadcastable)
+        tt.patternbroadcast(r, r_orig.broadcastable)
        for (r, r_orig) in zip(results, node.outputs)
    ]
@@ -855,16 +854,16 @@ def local_abstract_batch_norm_train_grad(node):
        return None
    x_diff = x - x_mean
-    mean_dy_x_diff = T.mean(dy * x_diff, axis=axes, keepdims=True)
+    mean_dy_x_diff = tt.mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))
-    g_wrt_inputs = scale * (c - T.mean(c, axis=axes, keepdims=True))
+    g_wrt_inputs = scale * (c - tt.mean(c, axis=axes, keepdims=True))
-    g_wrt_scale = T.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
+    g_wrt_scale = tt.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
-    g_wrt_bias = T.sum(dy, axis=axes, keepdims=True)
+    g_wrt_bias = tt.sum(dy, axis=axes, keepdims=True)
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
    results = [
-        T.patternbroadcast(r, r_orig.broadcastable)
+        tt.patternbroadcast(r, r_orig.broadcastable)
        for (r, r_orig) in zip(results, node.outputs)
    ]
@@ -896,9 +895,9 @@ def local_abstract_batch_norm_inference(node):
        epsilon = epsilon.astype("float32")
    result = (x - estimated_mean) * (
-        scale / T.sqrt(estimated_variance + epsilon)
+        scale / tt.sqrt(estimated_variance + epsilon)
    ) + bias
-    result = T.patternbroadcast(result, node.outputs[0].broadcastable)
+    result = tt.patternbroadcast(result, node.outputs[0].broadcastable)
    for var in theano.gof.graph.variables(node.inputs, [result]):
        if var not in node.inputs:

--- a/theano/tensor/nnet/ctc.py
+++ b/theano/tensor/nnet/ctc.py
 import os
 import sys
-import theano.tensor as T
-from theano import config
+import theano.tensor as tt
-from theano import gof
+from theano import config, gof
 from theano.gof import local_optimizer
 from theano.gof.cmodule import GCC_compiler
-from theano.tensor.opt import register_canonicalize
-from theano.tensor.extra_ops import cpu_contiguous
 from theano.gradient import grad_undefined
+from theano.tensor.extra_ops import cpu_contiguous
+from theano.tensor.opt import register_canonicalize
 def _ctc_find_lib():
@@ -156,12 +157,12 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
        return ["ctc.h"] + gof.OpenMPOp.c_headers(self)
    def make_node(self, activations, labels, input_lengths):
-        t_activations = T.as_tensor_variable(activations)
+        t_activations = tt.as_tensor_variable(activations)
        # Ensure activations array is C-contiguous
        t_activations = cpu_contiguous(t_activations)
-        t_labels = T.as_tensor_variable(labels)
+        t_labels = tt.as_tensor_variable(labels)
-        t_input_lengths = T.as_tensor_variable(input_lengths)
+        t_input_lengths = tt.as_tensor_variable(input_lengths)
        if t_activations.type.dtype != "float32":
            raise TypeError("activations must use the float32 type!")
@@ -181,10 +182,10 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
        if t_input_lengths.ndim != 1:
            raise ValueError("input_lengths must have 1 dimension.")
-        costs = T.fvector(name="ctc_cost")
+        costs = tt.fvector(name="ctc_cost")
        outputs = [costs]
        if self.compute_grad:
-            gradients = T.ftensor3(name="ctc_grad")
+            gradients = tt.ftensor3(name="ctc_grad")
            outputs += [gradients]
        return gof.Apply(
@@ -197,9 +198,9 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
        assert gradients is not None
        grad_op = output_grads[0]
-        total_grad = T.basic.batched_dot(
+        total_grad = tt.batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle(
-            grad_op, gradients.dimshuffle(1, 0, 2)
+            1, 0, 2
-        ).dimshuffle(1, 0, 2)
+        )
        return [
            total_grad,
            grad_undefined(self, 1, inputs[1]),

--- a/theano/tensor/nnet/neighbours.py
+++ b/theano/tensor/nnet/neighbours.py
@@ -2,16 +2,14 @@
 TODO: implement Images2Neibs.infer_shape() methods
 """
 import numpy as np
 import theano
-from theano import Op, Apply
+import theano.tensor as tt
+from theano import Apply, Op
 from theano.gof import EnumList
-import theano.tensor as T
+from theano.gradient import grad_not_implemented, grad_undefined
-from theano.gradient import grad_not_implemented
-from theano.gradient import grad_undefined
 class Images2Neibs(Op):
@@ -102,19 +100,19 @@ class Images2Neibs(Op):
                pattern.
        """
-        ten4 = T.as_tensor_variable(ten4)
+        ten4 = tt.as_tensor_variable(ten4)
-        neib_shape = T.as_tensor_variable(neib_shape)
+        neib_shape = tt.as_tensor_variable(neib_shape)
        if neib_step is None:
            neib_step = neib_shape
        else:
-            neib_step = T.as_tensor_variable(neib_step)
+            neib_step = tt.as_tensor_variable(neib_step)
        assert ten4.ndim == 4
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        return Apply(
-            self, [ten4, neib_shape, neib_step], [T.matrix(dtype=ten4.type.dtype)]
+            self, [ten4, neib_shape, neib_step], [tt.matrix(dtype=ten4.type.dtype)]
        )
    def grad(self, inp, grads):
@@ -165,14 +163,14 @@ class Images2Neibs(Op):
                    + ((rows - nrows) // rstep + 1,)
                    + ((cols - ncols) // cstep + 1,)
                )
-                return T.inc_subtensor(result_indices, pgz.reshape(newshape))
+                return tt.inc_subtensor(result_indices, pgz.reshape(newshape))
-            indices = T.arange(neib_shape[0] * neib_shape[1])
+            indices = tt.arange(neib_shape[0] * neib_shape[1])
            pgzs = gz.dimshuffle((1, 0))
            result, _ = theano.scan(
                fn=pos2map,
                sequences=[indices, pgzs],
-                outputs_info=T.zeros(x.shape),
+                outputs_info=tt.zeros(x.shape),
                non_sequences=[neib_shape, neib_step],
            )
            grad_input = result[-1]
@@ -354,8 +352,8 @@ class Images2Neibs(Op):
        c, d = node.inputs[1]
        step_x, step_y = node.inputs[2]
        if self.mode == "wrap_centered":
-            grid_c = T.ceil_intdiv(in_shape[2], step_x)
+            grid_c = tt.ceil_intdiv(in_shape[2], step_x)
-            grid_d = T.ceil_intdiv(in_shape[3], step_y)
+            grid_d = tt.ceil_intdiv(in_shape[3], step_y)
        elif self.mode == "valid":
            grid_c = 1 + ((in_shape[2] - c) // step_x)
            grid_d = 1 + ((in_shape[3] - d) // step_y)
@@ -795,11 +793,11 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
    .. note:: The code will output the initial image array.
    """
-    neibs = T.as_tensor_variable(neibs)
+    neibs = tt.as_tensor_variable(neibs)
-    neib_shape = T.as_tensor_variable(neib_shape)
+    neib_shape = tt.as_tensor_variable(neib_shape)
-    original_shape = T.as_tensor_variable(original_shape)
+    original_shape = tt.as_tensor_variable(original_shape)
-    new_neib_shape = T.stack([original_shape[-1] // neib_shape[1], neib_shape[1]])
+    new_neib_shape = tt.stack([original_shape[-1] // neib_shape[1], neib_shape[1]])
    output_2d = images2neibs(
        neibs.dimshuffle("x", "x", 0, 1), new_neib_shape, mode=mode
    )
@@ -809,10 +807,10 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
        # the shape and still raise error when it don't have the right
        # shape.
        valid_shape = original_shape
-        valid_shape = T.set_subtensor(
+        valid_shape = tt.set_subtensor(
            valid_shape[2], (valid_shape[2] // neib_shape[0]) * neib_shape[0]
        )
-        valid_shape = T.set_subtensor(
+        valid_shape = tt.set_subtensor(
            valid_shape[3], (valid_shape[3] // neib_shape[1]) * neib_shape[1]
        )
        output_4d = output_2d.reshape(valid_shape, ndim=4)
@@ -820,7 +818,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
        for d in [2, 3]:
            pad_shape = list(output_4d.shape)
            pad_shape[d] = original_shape[d] - valid_shape[d]
-            output_4d = T.concatenate([output_4d, T.zeros(pad_shape)], axis=d)
+            output_4d = tt.concatenate([output_4d, tt.zeros(pad_shape)], axis=d)
    elif mode == "valid":
        # TODO: we do not implement all mode with this code.
        # Add a check for the good cases.

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py