Merge pull request #2455 from nouiz/meta_conv

In the meta optimizer, try to fix broadcast pattern change.

Merge pull request #2455 from nouiz/meta_conv
c907bf60 · abergeron · f3e50a69 · ab5564ab · c907bf60 · c907bf60
--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -948,16 +948,17 @@ class ModuleCache(object):
            if (key[0] and not key_broken and
                self.check_for_broken_eq):
                self.check_key(key, key_data.key_pkl)
-            self._update_mappings(key, key_data, module.__file__)
+            self._update_mappings(key, key_data, module.__file__, check_in_keys=not key_broken)
            return module
        else:
            return None

-    def _update_mappings(self, key, key_data, name):
+    def _update_mappings(self, key, key_data, name, check_in_keys):
        all_keys = key_data.keys
        if not all_keys:
            all_keys = [key]
-        assert key in all_keys
+        if check_in_keys:
+            assert key in all_keys
        for k in all_keys:
            if k in self.entry_from_key:
                assert self.entry_from_key[k] == name
@@ -988,10 +989,10 @@ class ModuleCache(object):
            key_pkl=key_pkl,
            entry=name)

+        key_broken = False
        if key[0]:
            try:
                key_data.save_pkl()
-                key_broken = False
            except cPickle.PicklingError:
                key_broken = True
                key_data.remove_key(key)
@@ -1006,7 +1007,7 @@ class ModuleCache(object):
                            " following op(s) implement"
                            " c_code_cache_version(). This makes them"
                            " recompiled for each process." + str(ops))
-        self._update_mappings(key, key_data, module.__file__)
+        self._update_mappings(key, key_data, module.__file__, not key_broken)
        return key_data

    def module_from_key(self, key, lnk=None, keep_lock=False):

--- a/theano/gof/type.py
+++ b/theano/gof/type.py
@@ -595,3 +595,14 @@ if (py_%(name)s == NULL) { %(freefunc)s(%(name)s); }

    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.ctype)
+
+
+class CDataTypeConstant(graph.Constant):
+    def signature(self):
+        # The Op.c_code* methods can't access the data, so they can't
+        # change the code depending on it. So there is no need to put
+        # it in the signature. Also, under Python 2, PyCObjects aren't
+        # picklable. So using the PyCObject in the signature would
+        # disable the C code cache for ops that have it as an input.
+        return (self.type,)
+CDataType.Constant = CDataTypeConstant
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -1331,9 +1331,13 @@ if True:
            border_mode = node.op.border_mode
            subsample = node.op.subsample
            direction_hint = node.op.direction_hint
-            return [dnn_conv(img, kern,
-                             border_mode=border_mode, subsample=subsample,
-                             direction_hint=direction_hint)]
+            rval = dnn_conv(img, kern,
+                            border_mode=border_mode, subsample=subsample,
+                            direction_hint=direction_hint)
+            if node.outputs[0].broadcastable != rval.broadcastable:
+                rval = tensor.patternbroadcast(
+                    rval, node.outputs[0].type.broadcastable)
+            return [rval]

    # This optimizer is registered in opt.py as part of the meta-optimizer.
    # It tries exactly the opposite code path of what local_conv_dnn() uses,
@@ -1360,9 +1364,13 @@ if True:
                    direction_hint = 'forward'
                else:
                    direction_hint = 'bprop weights'
-            return [dnn_conv(img, kern,
-                             border_mode=border_mode, subsample=subsample,
-                             direction_hint=direction_hint)]
+            rval = dnn_conv(img, kern,
+                            border_mode=border_mode, subsample=subsample,
+                            direction_hint=direction_hint)
+            if node.outputs[0].broadcastable != rval.broadcastable:
+                rval = tensor.patternbroadcast(
+                    rval, node.outputs[0].type.broadcastable)
+            return [rval]

    @register_opt('cudnn')
    @local_optimizer([GpuDownsampleFactorMax])

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1148,10 +1148,11 @@ def _gpu_conv_to_fftconv(node):
            (node.op.imshp[0] is not None)):
        kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + node.op.kshp
    rval = conv2d_fft(node.inputs[0], node.inputs[1], **kwargs)
-    if ('image_shape' in kwargs) or ('filter_shape' in kwargs):
+    if node.outputs[0].broadcastable != rval.broadcastable:
        # With given shape information, conv2d_fft may return a different
        # broadcast pattern than GpuConv. This is forbidden, so we fix it.
-        rval = tensor.patternbroadcast(rval, node.outputs[0].type.broadcastable)
+        rval = tensor.patternbroadcast(
+            rval, node.outputs[0].type.broadcastable)
    return rval


@@ -1292,7 +1293,11 @@ def local_conv_gemm(node):
        if (border_mode == 'valid'):
            # need to flip the kernel for valid convolution
            kern = kern[:, :, ::-1, ::-1]
-            # call GpuCorrMM or GpuCorrMM_gradWeights
+            # By default use GpuCorrMM
+            rval = GpuCorrMM('valid', subsample, pad)(
+                gpu_contiguous(img), gpu_contiguous(kern))
+
+            # call GpuCorrMM_gradWeights instead when it should be faster
            # (the latter is faster if batchsize * kernelHeight * kernelWidth
            # is larger than inputChannels * outputHeight * outputWidth.
            # GpuConv does not always store information on the batchsize and
@@ -1317,21 +1322,23 @@ def local_conv_gemm(node):
                    # (we need to wrap the result in as_cuda_ndarray_variable,
                    # because we are not allowed to replace a CudaNdarray with
                    # a DimShuffle instance in a graph optimization)
-                    return [theano.sandbox.cuda.as_cuda_ndarray_variable(
-                            GpuCorrMM_gradWeights('valid', subsample, pad)(
+                    rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
+                        GpuCorrMM_gradWeights('valid', subsample, pad)(
                            gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
-                            ).dimshuffle(1, 0, 2, 3))]
-            # use GpuCorrMM if we did not choose GpuCorrMM_gradWeights above
-            return [GpuCorrMM('valid', subsample, pad)(
-                    gpu_contiguous(img), gpu_contiguous(kern))]
+                        ).dimshuffle(1, 0, 2, 3))
        elif (border_mode == 'full'):
            # need to dimshuffle the kernel for full convolution
            kern = kern.dimshuffle(1, 0, 2, 3)
            # call GpuCorrMM_gradInputs
-            return [GpuCorrMM_gradInputs('valid', subsample, pad)(
-                    gpu_contiguous(kern), gpu_contiguous(img))]
-
+            rval = GpuCorrMM_gradInputs('valid', subsample, pad)(
+                    gpu_contiguous(kern), gpu_contiguous(img))
+        if node.outputs[0].broadcastable != rval.broadcastable:
+            # The GpuCorrMM* replacement may have a different broadcast
+            # pattern than GpuConv. This is forbidden, so we fix it.
+            rval = tensor.patternbroadcast(
+                rval, node.outputs[0].type.broadcastable)
+        return [rval]

 # First we register the optimizer that moves convolutions to the GPU.
 register_opt()(local_gpu_conv)