Merge pull request #2298 from nouiz/convdnn_broadcast

Fix broadcast pattern of dnn conv op.

Merge pull request #2298 from nouiz/convdnn_broadcast
5c9e0b31 · abergeron · ad4b5528 · ac315f60 · 5c9e0b31 · 5c9e0b31
--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -342,8 +342,18 @@ def get_c_extract(r, name, sub):
    """Wrapper around c_extract that initializes py_name from storage."""
    if any([getattr(c.op, 'check_input', config.check_input) for (c, _) in
            r.clients]):
+        # check_broadcast is just a hack to easily remove just the
-        c_extract = r.type.c_extract(name, sub, True)
+        # broadcast check on the old GPU back-end. This check isn't
+        # done in the new GPU back-end or on the CPU.
+        if hasattr(c.op, 'check_broadcast'):
+            try:
+                c_extract = r.type.c_extract(
+                    name, sub, True,
+                    check_broadcast=c.op.check_broadcast)
+            except TypeError, e:
+                c_extract = r.type.c_extract(name, sub, True)
+        else:
+            c_extract = r.type.c_extract(name, sub, True)
    else:
        c_extract = r.type.c_extract(name, sub, False)

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -298,6 +298,8 @@ class GpuDimShuffle(GpuOp):
    """
    Implement DimShuffle on the gpu.
    """
+    check_broadcast = False
    def __init__(self, input_broadcastable, new_order):
        input_broadcastable = tuple(input_broadcastable)
        self.input_broadcastable = input_broadcastable
@@ -2355,6 +2357,8 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
    """
    Implement subtensor on the gpu.
    """
+    check_broadcast = False
    # __hash__, __eq__, __str__ come from tensor.Subtensor
    def make_node(self, x, *inputs):
        assert isinstance(x.type, CudaNdarrayType)
@@ -3352,6 +3356,7 @@ class GpuContiguous(GpuOp):
    not already c contiguous.
    """
    view_map = {0: [0]}
+    check_input = False
    def __eq__(self, other):
        return type(self) == type(other)

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -513,8 +513,9 @@ class BaseGpuCorrMM(GpuOp):
        integers
    :param subsample: perform subsampling of the output (default: (1, 1))
    :param pad: *deprecated*, now you should always use border_mode
    """
+    check_broadcast = False
    def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)):
        if pad != (0, 0):
@@ -1498,6 +1499,8 @@ class GpuConv(GpuOp):
    """
    Implement the batched and stacked 2d convolution on the gpu.
    """
+    check_broadcast = False
    @staticmethod
    def logical_output_shape_2d(imshp, kshp, mode):
        if mode == 'valid':

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -7,7 +7,7 @@ from theano.gof.type import CDataType
 from theano.compat import PY3
 from theano.tensor.nnet import SoftmaxGrad
 from theano.sandbox.cuda.type import CudaNdarrayType
-from theano.sandbox.cuda import (GpuOp, cuda_available)
+from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           gpu_contiguous, HostFromGpu)
 from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
@@ -244,23 +244,7 @@ class GpuDnnConvDesc(GpuOp):
 class GpuDnnConvBase(DnnBase):
    __props__ = ()
+    check_broadcast = False
-    def make_node(self, img, kern, desc):
-        if img.type.ndim != 4:
-            raise TypeError('img must be 4D tensor')
-        if kern.type.ndim != 4:
-            raise TypeError('kern must be 4D tensor')
-        if not isinstance(desc.type, CDataType) \
-                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
-            raise TypeError('desc must be cudnnConvolutionDescriptor_t')
-        broadcastable = (img.type.broadcastable[0],
-                         kern.type.broadcastable[0],
-                         False, False)
-        return Apply(self, [img, kern, desc],
-                     [CudaNdarrayType(broadcastable)()])
    def c_support_code_struct(self, node, struct_id):
        return """
@@ -417,6 +401,24 @@ class GpuDnnConv(GpuDnnConvBase):
    conv_op = 'cudnnConvolutionForward'
    path_flag = 'CUDNN_CONVOLUTION_FWD'
+    def make_node(self, img, kern, desc):
+        img = as_cuda_ndarray_variable(img)
+        kern = as_cuda_ndarray_variable(kern)
+        if img.type.ndim != 4:
+            raise TypeError('img must be 4D tensor')
+        if kern.type.ndim != 4:
+            raise TypeError('kern must be 4D tensor')
+        if not isinstance(desc.type, CDataType) \
+                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
+            raise TypeError('desc must be cudnnConvolutionDescriptor_t')
+        broadcastable = (img.type.broadcastable[0],
+                         kern.type.broadcastable[0],
+                         False, False)
+        return Apply(self, [img, kern, desc],
+                     [CudaNdarrayType(broadcastable)()])
    def grad(self, inp, grads):
        img, kerns, desc = inp
        top, = grads
@@ -464,6 +466,24 @@ class GpuDnnConvGradW(GpuDnnConvBase):
        # not connected to desc
        return [[1], [1], [0]]
+    def make_node(self, img, topgrad, desc):
+        img = as_cuda_ndarray_variable(img)
+        topgrad = as_cuda_ndarray_variable(topgrad)
+        if img.type.ndim != 4:
+            raise TypeError('img must be 4D tensor')
+        if topgrad.type.ndim != 4:
+            raise TypeError('topgrad must be 4D tensor')
+        if not isinstance(desc.type, CDataType) \
+                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
+            raise TypeError('desc must be cudnnConvolutionDescriptor_t')
+        broadcastable = [topgrad.type.broadcastable[1],
+                         img.type.broadcastable[1],
+                         False, False]
+        return Apply(self, [img, topgrad, desc],
+                     [CudaNdarrayType(broadcastable)()])
 class GpuDnnConvGradI(GpuDnnConvBase):
    """
@@ -496,6 +516,24 @@ class GpuDnnConvGradI(GpuDnnConvBase):
        # not connected to desc
        return [[1], [1], [0]]
+    def make_node(self, kern, topgrad, desc):
+        kern = as_cuda_ndarray_variable(kern)
+        topgrad = as_cuda_ndarray_variable(topgrad)
+        if kern.type.ndim != 4:
+            raise TypeError('kern must be 4D tensor')
+        if topgrad.type.ndim != 4:
+            raise TypeError('topgrad must be 4D tensor')
+        if not isinstance(desc.type, CDataType) \
+                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
+            raise TypeError('desc must be cudnnConvolutionDescriptor_t')
+        broadcastable = [topgrad.type.broadcastable[0],
+                         kern.type.broadcastable[1],
+                         False, False]
+        return Apply(self, [kern, topgrad, desc],
+                     [CudaNdarrayType(broadcastable)()])
 def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv', direction_hint=None):

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -35,8 +35,6 @@ if theano.config.mode == 'FAST_COMPILE':
 else:
    theano_mode = theano.compile.mode.get_default_mode().including('gpu')
-cuda_tensor4 = cuda.CudaNdarrayType([False] * 4)
 device_id = theano.sandbox.cuda.use.device_number
 if device_id is None:
    cuda.shared_constructor(numpy.zeros(2, dtype='float32'))
@@ -189,13 +187,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
    t0 = time.time()
    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
    t1 = time.time()
-    i = cuda_tensor4()
+    i = cuda.CudaNdarrayType(
-    k = cuda_tensor4()
+        broadcastable=[sh == 1 for sh in npy_img.shape])()
+    k = cuda.CudaNdarrayType(
+        broadcastable=[sh == 1 for sh in npy_kern.shape])()
    op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
                                          subsample=subsample,
                                          version=version,
                                          verbose=verbose,
                                          kshp=compile_kshp)(i, k)
+    assert [(sh == 1) is br for
+            sh, br in zip(cpuval.shape[:2], op.type.broadcastable[:2])]
    f = theano.function([i, k], op, mode=theano_mode)
    if cls is not None:
        assert any([isinstance(node.op, cls)
@@ -905,8 +907,10 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
-    i = cuda_tensor4()
+    i = cuda.CudaNdarrayType(
-    k = cuda_tensor4()
+        broadcastable=[sh == 1 for sh in npy_img.shape])()
+    k = cuda.CudaNdarrayType(
+        broadcastable=[sh == 1 for sh in npy_kern.shape])()
    if direction == 'fprop':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
@@ -971,8 +975,10 @@ def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
-    i = cuda_tensor4()
+    i = cuda.CudaNdarrayType(
-    k = cuda_tensor4()
+        broadcastable=[sh == 1 for sh in npy_img.shape])()
+    k = cuda.CudaNdarrayType(
+        broadcastable=[sh == 1 for sh in npy_kern.shape])()
    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
@@ -1009,9 +1015,12 @@ def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    allvals = f(npy_img, npy_kern)
-    for a, b, p in zip(allvals[::2], allvals[1::2],
+    for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2],
-                       ('top', 'dtop/dbottom', 'dtop/dweight',
+                               outputs[::2], outputs[1::2],
-                        'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
+                               ('top', 'dtop/dbottom', 'dtop/dweight',
+                                'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
+        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]
        assert_allclose(a, b, rtol=1e-4)

--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
@@ -7,7 +7,6 @@ import warnings
 import numpy
 import theano
-from theano import config
 from theano import Type, Variable
 from theano import tensor, config
 from theano import scalar as scal
@@ -280,7 +279,8 @@ class CudaNdarrayType(Type):
    def c_init(self, name, sub):
        return "%(name)s = NULL;" % locals()
-    def c_extract(self, name, sub, check_input=True):
+    def c_extract(self, name, sub, check_input=True,
+                  check_broadcast=True):
        sio = StringIO()
        fail = sub['fail']
        nd = self.ndim
@@ -307,7 +307,7 @@ class CudaNdarrayType(Type):
                //std::cerr << "c_extract " << %(name)s << " nd check passed\\n";
            """ % locals()
            for i, b in enumerate(self.broadcastable):
-                if b:
+                if b and check_broadcast:
                    print >> sio, """
                if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1)
                {

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -547,7 +547,9 @@ get_scalar_constant_value_elemwises = (
 def get_scalar_constant_value(orig_v, elemwise=True):
    """return the constant scalar(0-D) value underlying variable `v`
-    If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast
+    If v is the output of dimshuffles, fills, allocs, rebroadcasts,
+    cast, OutputGuard, DeepCopyOp, ScalarFromTensor, ScalarOp,
+    Elemwise and some patterns with Subtensor,
    this function digs through them.
    If `v` is not some view of constant scalar data, then raise a
@@ -587,12 +589,14 @@ def get_scalar_constant_value(orig_v, elemwise=True):
                continue
            elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
                if isinstance(v.owner.inputs[0], Constant):
-                    return v.owner.inputs[0].data.shape[v.owner.op.i]
+                    return numpy.asarray(v.owner.inputs[0].data.shape[v.owner.op.i])
            # Don't act as the constant_folding optimization here as this
            # fct is used too early in the optimization phase.  This would
            # mess with the stabilization optimization and be too slow.
            # We put all the scalar Ops used by get_canonical_form_slice()
            # to allow it to determine the broadcast pattern correctly.
+            elif isinstance(v.owner.op, ScalarFromTensor):
+                return get_scalar_constant_value(v.owner.inputs[0])
            elif isinstance(v.owner.op, scal.ScalarOp):
                if isinstance(v.owner.op, scal.Second):
                    # We don't need both input to be constant for second
@@ -3504,9 +3508,9 @@ class Join(Op):
        # Join op should get at least one input to join
        assert len(ishapes) > 1
        n_dim = len(ishapes[1])
-        for shape in ishapes[1:]:
+        for shp in ishapes[1:]:
-            assert shape is not None
+            assert shp is not None
-            assert len(shape) == n_dim
+            assert len(shp) == n_dim
        out_shapes = []
        for dim in xrange(n_dim):
@@ -3522,8 +3526,8 @@ class Join(Op):
            t_side = ishapes[1][dim]
            f_side = ishapes[1][dim]
            # loop over tensors and sum for the joining dimension
-            for shape in ishapes[2:]:
+            for shp in ishapes[2:]:
-                t_side = t_side + shape[dim]
+                t_side = t_side + shp[dim]
            # return the dimensions found
            out_shapes.append(switch(eq(dim, node.inputs[0]),
                              t_side, f_side))

--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
@@ -16,7 +16,7 @@ from theano.gof.python25 import maxsize
 from theano.printing import pprint
 from theano import scalar as scal
 from theano.tensor.basic import (addbroadcast, clip, get_scalar_constant_value,
-                                 ARange, TensorType)
+                                 ARange, TensorType, NotScalarConstantError)
 from theano.tensor.elemwise import DimShuffle
 from theano.tensor.type_other import NoneConst, SliceType, make_slice
 from theano import config
@@ -470,15 +470,22 @@ class Subtensor(Op):
        broadcastable = []
        for i, (p, bc) in enumerate(izip(padded, x.type.broadcastable)):
            if isinstance(p, slice):
-                if bc and p.start in [None, 0]:
+                if bc:
                    start = p.start
-                    if start is None:
+                    try:
-                        start = 0
+                        start = get_scalar_constant_value(start)
-                    if (p.stop is None or
+                    except NotScalarConstantError:
-                        (isinstance(p.stop, (int, numpy.integer)) and
+                        pass
-                         p.stop > start)):
+                    if start in [None, 0]:
-                        broadcastable.append(True)
+                        start = p.start
-                        continue
+                        if start is None:
+                            start = 0
+                        if (p.stop is None or
+                            (isinstance(p.stop, (int, numpy.integer,
+                                                 numpy.ndarray)) and
+                             p.stop > start)):
+                            broadcastable.append(True)
+                            continue
                broadcastable.append(False)