提交 1477ded8 · 作者: f0k

GpuConv now adopts nkern and bsize from ConvOp, conv_gemm and conv_fft_*…

GpuConv now adopts nkern and bsize from ConvOp, conv_gemm and conv_fft_* optimizers use it if available
上级 fe7d8bab
......@@ -972,6 +972,8 @@ class GpuConv(GpuOp):
kshp=None,
imshp=None,
max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True):
"""
:param version: each version of c_code implements many kernel for the
......@@ -991,7 +993,15 @@ class GpuConv(GpuOp):
:param max_threads_dim0: The maximum number of threads for the
block size dimensions 0 (blockDim.x) used by the
GPU function.
:param fft_opt: desactivate fft_opt optimization at the op level when
:param nkern: The number of kernels. Not used for this op, but can be
used by graph optimizers to select a more optimal
convolution implementation. If the GpuConv op is inserted
automatically, we take its value from the Conv op.
:param bsize: The batch size. Not used for this op, but can be
used by graph optimizers to select a more optimal
convolution implementation. If the GpuConv op is inserted
automatically, we take its value from the Conv op.
:param fft_opt: deactivate fft_opt optimization at the op level when
set to False. Note that by default fft optimization
aren't enabled. See
:ref:`convolution documentation <libdoc_tensor_nnet_conv>`
......@@ -1019,6 +1029,8 @@ class GpuConv(GpuOp):
self.kshp = kshp
self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
self.nkern = nkern
self.bsize = bsize
self.fft_opt = fft_opt
def __eq__(self, other):
......
......@@ -1122,6 +1122,8 @@ def local_gpu_conv(node):
version=op.version,
verbose=op.verbose,
imshp=op.imshp,
nkern=op.nkern,
bsize=op.bsize,
fft_opt=op.fft_opt
)
if op.imshp_logical is not None:
......@@ -1207,14 +1209,19 @@ def _gpu_conv_to_fftconv(node):
node.op.imshp[-1] is not None and
node.op.imshp[-1] % 2 == 1):
kwargs['pad_last_dim'] = True
# TODO: If the user supplied the full nonsymbolic image_shape and
# filter_shape in conv2d(), we could pass it on to conv2d_fft(). However,
# information on batch size and channel counts is currently discarded
# when a ConvOp is replaced by a GpuConv, so this would need more changes.
#if (node.op.imshp is not None) and (None not in node.op.imshp):
# kwargs['image_shape'] = (bsize, inchannels) + node.op.imshp
#if (node.op.kshp is not None) and (None not in node.op.kshp):
# kwargs['filter_shape'] = (outchannels, inchannels) + node.op.kshp
# If the user supplied the full nonsymbolic image_shape and
# filter_shape in conv2d(), we can pass it on to conv2d_fft().
if ((node.op.imshp is not None) and
(len(node.op.imshp) == 3) and
(None not in node.op.imshp) and
(node.op.bsize is not None)):
kwargs['image_shape'] = (node.op.bsize,) + node.op.imshp
if ((node.op.kshp is not None) and
(None not in node.op.kshp) and
(node.op.nkern is not None) and
(len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)):
kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + node.op.kshp
return conv2d_fft(node.inputs[0], node.inputs[1], **kwargs)
......@@ -1363,25 +1370,34 @@ def local_conv_gemm(node):
# need to flip the kernel for valid convolution
kern = kern[:, :, ::-1, ::-1]
# call GpuCorrMM or GpuCorrMM_gradWeights
# (GpuCorrMM seems faster if batchsize * kernelHeight * kernelWidth
# is smaller than inputChannels * outputHeight * outputWidth.
# GpuConv does not store information on the batchsize and not always
# on the channels, so we only use what information we have.)
# (the latter is faster if batchsize * kernelHeight * kernelWidth
# is larger than inputChannels * outputHeight * outputWidth.
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
if ((subsample == (1,1)) and
(node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and
(None not in node.op.kshp) and
(node.op.kshp[0] * node.op.kshp[1] >
(node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))):
return [gpu_contiguous(GpuCorrMM_gradWeights('valid', subsample, pad)(
(None not in node.op.kshp)):
# we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))
if ((node.op.bsize is not None) and
(len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)):
# we also know batchsize and input channels
prod1 *= node.op.bsize
prod2 *= node.op.imshp[0]
# compare to decide
if prod1 > prod2:
return [gpu_contiguous(GpuCorrMM_gradWeights('valid', subsample, pad)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))]
else:
return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))]
).dimshuffle(1, 0, 2, 3))]
# use GpuCorrMM if we did not choose GpuCorrMM_gradWeights above
return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))]
elif (border_mode == 'full'):
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论