提交 e181b34f authored 作者: f0k's avatar f0k

conv_gemm optimizer chooses between GpuCorrMM and GpuCorrMM_gradWeights for valid convolution

上级 c46a0243
...@@ -1362,8 +1362,24 @@ def local_conv_gemm(node): ...@@ -1362,8 +1362,24 @@ def local_conv_gemm(node):
if (border_mode == 'valid'): if (border_mode == 'valid'):
# need to flip the kernel for valid convolution # need to flip the kernel for valid convolution
kern = kern[:, :, ::-1, ::-1] kern = kern[:, :, ::-1, ::-1]
# call GpuCorrMM # call GpuCorrMM or GpuCorrMM_gradWeights
# TODO: call GpuCorrMM_gradWeights instead if appropriate # (GpuCorrMM seems faster if batchsize * kernelHeight * kernelWidth
# is smaller than inputChannels * outputHeight * outputWidth.
# GpuConv does not store information on the batchsize and not always
# on the channels, so we only use what information we have.)
if ((subsample == (1,1)) and
(node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and
(None not in node.op.kshp) and
(node.op.kshp[0] * node.op.kshp[1] >
(node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))):
return [gpu_contiguous(GpuCorrMM_gradWeights('valid', subsample, pad)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))]
else:
return [GpuCorrMM('valid', subsample, pad)( return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))] gpu_contiguous(img), gpu_contiguous(kern))]
elif (border_mode == 'full'): elif (border_mode == 'full'):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论