Commit 4736c9b3 authored by Pascal Lamblin

Merge pull request #2665 from ballasn/conv2d_interface

New conv2d interface (work in progress)
...@@ -13,6 +13,9 @@ from theano.compile.ops import shape_i ...@@ -13,6 +13,9 @@ from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import ( from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.opt import register_specialize_device
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, host_from_gpu,
...@@ -27,6 +30,12 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt ...@@ -27,6 +30,12 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.tensor.nnet.abstract_conv2d import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.opt import register_specialize_device
def dnn_available(): def dnn_available():
if dnn_available.avail is None: if dnn_available.avail is None:
...@@ -1276,6 +1285,58 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1276,6 +1285,58 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
return GpuDnnConv3d(algo=algo)(img, kerns, out, desc) return GpuDnnConv3d(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad,
                   kerns_shp,
                   border_mode='valid', subsample=(1, 1),
                   conv_mode='conv'):
    """
    GPU convolution gradient with respect to weight using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    :param img: images of the forward convolution (4D, bc01 layout).
    :param topgrad: gradient w.r.t. the output of the forward convolution
        (4D, bc01 layout).
    :param kerns_shp: shape of the kernels whose gradient is computed;
        used both to build the convolution descriptor and to allocate
        the output buffer.
    :param border_mode: padding of the forward convolution, forwarded to
        :class:`GpuDnnConvDesc`.
    :param subsample: strides of the forward convolution, pair of int.
    :param conv_mode: 'conv' (kernels flipped) or 'cross' (correlation).

    :warning: The cuDNN library only works with GPU that have a compute
        capability of 3.0 or higher. This means that older GPU will not
        work with this Op.
    """
    # cuDNN kernels require contiguous inputs.
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns_shp)
    out = gpu_alloc_empty(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad,
                  img_shp,
                  border_mode='valid', subsample=(1, 1),
                  conv_mode='conv'):
    """
    GPU convolution gradient with respect to input using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    :param kerns: kernels of the forward convolution (4D, bc01 layout).
    :param topgrad: gradient w.r.t. the output of the forward convolution
        (4D, bc01 layout).
    :param img_shp: shape of the images whose gradient is computed;
        used both to build the convolution descriptor and to allocate
        the output buffer.
    :param border_mode: padding of the forward convolution, forwarded to
        :class:`GpuDnnConvDesc`.
    :param subsample: strides of the forward convolution, pair of int.
    :param conv_mode: 'conv' (kernels flipped) or 'cross' (correlation).

    :warning: The cuDNN library only works with GPU that have a compute
        capability of 3.0 or higher. This means that older GPU will not
        work with this Op.
    """
    # cuDNN kernels require contiguous inputs.
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    img_shp = theano.tensor.as_tensor_variable(img_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img_shp, kerns.shape)
    out = gpu_alloc_empty(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(GpuOp): class GpuDnnPoolDesc(GpuOp):
""" """
This Op builds a pooling descriptor for use in the other pooling operations. This Op builds a pooling descriptor for use in the other pooling operations.
...@@ -2383,3 +2444,47 @@ if True: ...@@ -2383,3 +2444,47 @@ if True:
gpu_contiguous(ins[1]) gpu_contiguous(ins[1])
) )
return [out.dimshuffle(0, 1)] return [out.dimshuffle(0, 1)]
### AbstractConv Optimizations
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """
    Replace an abstract conv op (forward, gradWeights or gradInputs)
    by its cuDNN implementation when both inputs are on the GPU and
    cuDNN is available. Returns None when it does not apply.
    """
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    # BUG FIX: the original test used `or` between the three negated
    # isinstance checks, which is always true (no op is an instance of
    # all three classes), so the optimizer never applied. Bail out only
    # when the op is none of the handled classes.
    if not isinstance(node.op, (AbstractConv2d,
                                AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return None
    if not isinstance(inp1.type, CudaNdarrayType) or \
            not isinstance(inp2.type, CudaNdarrayType):
        return None
    if not dnn_available():
        return None
    # BUG FIX: the op attribute defined by BaseAbstractConv2d is
    # `filter_flip`; the original read `filters_flip`, which raised
    # AttributeError.
    if node.op.filter_flip:
        conv_mode = 'conv'
    else:
        conv_mode = 'cross'
    if isinstance(node.op, AbstractConv2d):
        rval = dnn_conv(inp1, inp2,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='forward',
                        conv_mode=conv_mode)
        return [rval]
    if isinstance(node.op, AbstractConv2d_gradWeights):
        # Kernel shape: (output channels, input channels, rows, cols)
        shape = (inp2.shape[1], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradweight(inp1, inp2, shape,
                              border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              conv_mode=conv_mode)
        return [rval]
    if isinstance(node.op, AbstractConv2d_gradInputs):
        # Image shape: (batch size, input channels, rows, cols)
        shape = (inp2.shape[0], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradinput(inp1, inp2, shape,
                             border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             conv_mode=conv_mode)
        return [rval]
...@@ -75,6 +75,12 @@ from theano.tensor import slinalg ...@@ -75,6 +75,12 @@ from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from theano.tensor.nnet.abstract_conv2d import (BaseAbstractConv2d, AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.opt import register_specialize_device
try: try:
# We need to be able to import this file even if cuda isn't avail. # We need to be able to import this file even if cuda isn't avail.
from theano.sandbox.cuda import device_properties from theano.sandbox.cuda import device_properties
...@@ -2622,3 +2628,179 @@ optdb.register('local_inplace_gpu_sparse_block_outer', ...@@ -2622,3 +2628,179 @@ optdb.register('local_inplace_gpu_sparse_block_outer',
import theano.sandbox.cuda.extra_ops import theano.sandbox.cuda.extra_ops
### Move to Gpu optimization
@local_optimizer([gpu_from_host,
                  AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_conv2d_gpu_conv(node):
    """
    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)
    """
    if isinstance(node.op, GpuFromHost):
        # Case 1: the conv runs on the host and its result is transferred
        # to the GPU; move the conv itself to the GPU instead.
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, BaseAbstractConv2d):
            conv = host_input.owner.op
            inps = list(host_input.owner.inputs)
            inps[0] = as_cuda_ndarray_variable(inps[0])
            inps[1] = as_cuda_ndarray_variable(inps[1])
            out = conv(*inps)
            # out is on the GPU because both inputs are.
            # Restore the broadcast pattern of the node being replaced.
            out = theano.tensor.patternbroadcast(out,
                                                 node.outputs[0].broadcastable)
            # GPU results are compared with a looser tolerance.
            out.values_eq_approx = values_eq_approx_high_tol
            return [out]
    if isinstance(node.op, BaseAbstractConv2d):
        # Case 2: conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        inp1 = node.inputs[0]
        inp2 = node.inputs[1]
        if ((isinstance(inp1.type, CudaNdarrayType) and
             isinstance(inp2.type, CudaNdarrayType))):
            # Both inputs are already directly on the GPU, nothing to do
            return
        # An input counts as "on the GPU" if it is a GPU variable or if
        # it is a host transfer of a GPU variable.
        inp1_on_gpu = (isinstance(inp1.type, CudaNdarrayType) or
                       (inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))
        inp2_on_gpu = (isinstance(inp2.type, CudaNdarrayType) or
                       (inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))
        if inp1_on_gpu or inp2_on_gpu:
            conv = node.op
            inps = list(node.inputs)
            inps[0] = as_cuda_ndarray_variable(inps[0])
            inps[1] = as_cuda_ndarray_variable(inps[1])
            out = conv(*inps)
            # out is on the GPU because both inputs are.
            out = theano.tensor.patternbroadcast(
                out,
                node.outputs[0].broadcastable)
            out.values_eq_approx = values_eq_approx_high_tol
            # If the original output was on CPU, we have to transfer it
            if isinstance(node.outputs[0].type, tensor.TensorType):
                return [tensor.as_tensor_variable(out)]
            else:
                return [out]
register_opt()(local_conv2d_gpu_conv)
### Corrmm opt
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node):
    """
    Replace a forward AbstractConv2d whose inputs are on the GPU by a
    GEMM-based correlation (GpuCorrMM family), choosing among the
    forward, gradInputs and gradWeights kernels for speed.
    """
    if not isinstance(node.op, AbstractConv2d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, CudaNdarrayType) or
            not isinstance(kern.type, CudaNdarrayType)):
        return None
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    if (border_mode == 'full') and (subsample == (1, 1)):
        # Full, unstrided convolution is computed as the input-gradient
        # of a valid correlation.
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3)
        # call GpuCorrMM_gradInputs
        rval = GpuCorrMM_gradInputs('valid', subsample)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
                                                 gpu_contiguous(kern))
        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
        # is larger than inputChannels * outputHeight * outputWidth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1,1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-2:]) and
                (node.op.kshp is not None) and
                (None not in node.op.kshp)):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1]
            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
                     (node.op.imshp[-1] - node.op.kshp[1] + 1))
            if (None not in node.op.imshp[:1]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
                prod2 *= node.op.imshp[1]
            # compare to decide
            if prod1 > prod2:
                # (we need to wrap the result in as_cuda_ndarray_variable,
                # because we are not allowed to replace a CudaNdarray with
                # a DimShuffle instance in a graph optimization)
                rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
                    GpuCorrMM_gradWeights(border_mode, subsample)(
                        gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                        gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
                    ).dimshuffle(1, 0, 2, 3))
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node):
    """
    Lower an AbstractConv2d_gradWeights whose inputs live on the GPU
    to the GEMM-based GpuCorrMM_gradWeights implementation.
    """
    op = node.op
    if not isinstance(op, AbstractConv2d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    both_on_gpu = (isinstance(img.type, CudaNdarrayType) and
                   isinstance(topgrad.type, CudaNdarrayType))
    if not both_on_gpu:
        return None
    corr_op = GpuCorrMM_gradWeights(border_mode=op.border_mode,
                                    subsample=op.subsample)
    res = corr_op(gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if op.filter_flip:
        # Correlation produces the unflipped weight gradient; flip it
        # back to match the convolution convention.
        res = res[:, :, ::-1, ::-1]
    res = tensor.patternbroadcast(res, node.outputs[0].broadcastable)
    res = as_cuda_ndarray_variable(res)
    return [res]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
    """
    Lower an AbstractConv2d_gradInputs whose inputs live on the GPU
    to the GEMM-based GpuCorrMM_gradInputs implementation.
    """
    op = node.op
    if not isinstance(op, AbstractConv2d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    both_on_gpu = (isinstance(kern.type, CudaNdarrayType) and
                   isinstance(topgrad.type, CudaNdarrayType))
    if not both_on_gpu:
        return None
    if op.filter_flip:
        # Correlation expects unflipped kernels; flip them first.
        kern = kern[:, :, ::-1, ::-1]
    corr_op = GpuCorrMM_gradInputs(border_mode=op.border_mode,
                                   subsample=op.subsample)
    res = corr_op(gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [res]
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpu_abstractconv_opts"
register_specialize_device(abstractconv_groupopt, 'gpu', 'fast_compile')
# NOTE(review): the optimizers below are registered into `conv_groupopt`
# (defined elsewhere in this module), so `abstractconv_groupopt` is created
# and registered but never populated here — confirm whether these
# registrations should target `abstractconv_groupopt` instead.
# cuDNN is first, but only registered if cuDNN is available.
conv_groupopt.register('local_abstractconv_dnn', dnn.local_abstractconv_cudnn, 20,
                       'conv_dnn',
                       'gpu', 'fast_compile', 'fast_run', 'cudnn')
# The GEMM-based convolution comes last to catch all remaining cases.
# It can be disabled by excluding 'conv_gemm'.
conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradweight_gemm',
                       local_abstractconv_gradweight_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradinputs_gemm',
                       local_abstractconv_gradinputs_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
import unittest
import numpy
import itertools
import theano
from theano.tests import unittest_tools as utt
import theano.tensor.nnet.abstract_conv2d as conv
from theano.sandbox.cuda import float32_shared_constructor as gpu_shared
from theano.compile import shared as cpu_shared
from theano.sandbox.cuda.dnn import dnn_available, dnn_conv, dnn_gradweight, dnn_gradinput
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda
# Skip this whole test module when CUDA is not available.
if not cuda.cuda_available:
    raise SkipTest('Optional package cuda disabled')

# Always test with FAST_RUN optimizations; only the GPU inclusion differs.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    # NOTE(review): this line uses theano.compile.get_default_mode while
    # the line above uses theano.compile.mode.get_default_mode —
    # presumably aliases of the same function; confirm.
    mode_without_gpu = theano.compile.get_default_mode().excluding('gpu')
class TestConv2d(unittest.TestCase):
    """
    Compare the conv2d abstract-op implementations (cuDNN, GEMM/corrMM,
    CPU) against a reference implementation over a grid of input/filter
    shapes, strides, border modes and filter-flip settings.
    """

    def setUp(self):
        super(TestConv2d, self).setUp()
        # inputs_shapes and filters_shapes are paired element-wise (via
        # zip) in the tests, not combined as a full product.
        self.inputs_shapes = [(8, 1, 12, 12), (8, 1, 18, 18), (2, 1, 4, 4),
                              (6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
        self.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
                               (1, 1, 2, 5), (4, 1, 2, 2), (4, 5, 2, 2)]
        self.subsamples = [(1, 1), (2, 2), (2, 4)]
        self.border_modes = ["valid", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
        self.filter_flip = [True, False]

    def get_output_shape(self, inputs_shape, filters_shape, subsample, border_mode):
        """Expected (batch, nfilters, rows, cols) of the forward conv."""
        # Normalize the symbolic border modes into explicit padding pairs.
        if border_mode == "valid":
            border_mode = (0, 0)
        if border_mode == "full":
            border_mode = (filters_shape[2] - 1, filters_shape[3] - 1)
        batch_size = inputs_shape[0]
        num_filters = filters_shape[0]
        return (batch_size, num_filters,) \
            + tuple(None if i is None or k is None
                    else ((i + 2 * pad - k) // d + 1)
                    for i, k, d, pad in zip(inputs_shape[2:], filters_shape[2:],
                                            subsample, border_mode))

    def run_fwd(self, inputs_shape, filters_shape, ref=dnn_conv,
                subsample=(1, 1), verify_grad=True, mode=mode_without_gpu,
                border_mode='valid', filter_flip=True, device='cpu', provide_shape=False):
        """Run the forward convolution and compare against `ref`."""
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        if device == 'gpu':
            inputs = gpu_shared(inputs_val)
            filters = gpu_shared(filters_val)
        else:
            inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
            filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        # filter_flip=True corresponds to a true convolution ('conv'),
        # False to a cross-correlation ('cross') in the reference API.
        if filter_flip:
            conv_mode = 'conv'
        else:
            conv_mode = 'cross'
        c_ref = ref(inputs, filters,
                    border_mode=border_mode,
                    subsample=subsample,
                    conv_mode=conv_mode)
        c = conv.conv2d(inputs, filters,
                        border_mode=border_mode,
                        subsample=subsample,
                        filter_flip=filter_flip,
                        input_shape=imshp,
                        filter_shape=kshp)
        f_ref = theano.function([], c_ref, mode=mode)
        f = theano.function([], c, mode)
        res_ref = numpy.array(f_ref())
        res = numpy.array(f())
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            # NOTE(review): verify_grad always builds the op with
            # border_mode="valid" and the default filter_flip, regardless
            # of the parameters under test — confirm this is intentional.
            utt.verify_grad(conv.AbstractConv2d(border_mode="valid", imshp=imshp, kshp=kshp,
                                                subsample=subsample),
                            [inputs_val, filters_val],
                            mode=mode)

    def run_gradweight(self, inputs_shape, filters_shape, output_shape,
                       ref=dnn_gradweight, subsample=(1, 1), filter_flip=True,
                       verify_grad=True, mode=mode_without_gpu, border_mode='valid',
                       device='cpu', provide_shape=False):
        """Run the gradient w.r.t. weights and compare against `ref`."""
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        output_val = numpy.random.random(output_shape).astype('float32')
        if device == 'gpu':
            inputs = gpu_shared(inputs_val)
            output = gpu_shared(output_val)
        else:
            inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
            output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        if filter_flip:
            conv_mode = 'conv'
        else:
            conv_mode = 'cross'
        c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
                                            filter_flip=filter_flip,
                                            subsample=subsample,
                                            imshp=imshp, kshp=kshp)
        # The op also takes the spatial size of the filters as input.
        c = c(inputs, output, filters_shape[-2:])
        c_ref = ref(inputs, output,
                    filters_shape,
                    border_mode=border_mode,
                    subsample=subsample,
                    conv_mode=conv_mode)
        f = theano.function([], c, mode)
        f_ref = theano.function([], c_ref, mode)
        res_ref = numpy.array(f_ref())
        res = numpy.array(f())
        utt.assert_allclose(res_ref, res)

        def abstract_conv2d_gradweight(inputs_val, output_val):
            conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode, subsample=subsample)
            return conv_op(inputs_val, output_val, filters_shape[-2:])

        if verify_grad:
            utt.verify_grad(abstract_conv2d_gradweight, [inputs_val, output_val],
                            mode=mode, eps=1)

    def run_gradinput(self, inputs_shape, filters_shape, output_shape, ref=dnn_gradinput,
                      subsample=(1, 1), filter_flip=True, verify_grad=True, mode=mode_without_gpu,
                      border_mode='valid', device='cpu', provide_shape=False):
        """Run the gradient w.r.t. inputs and compare against `ref`."""
        output_val = numpy.random.random(output_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        if device == 'gpu':
            output = gpu_shared(output_val)
            filters = gpu_shared(filters_val)
        else:
            output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
            filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        if filter_flip:
            conv_mode = 'conv'
        else:
            conv_mode = 'cross'
        c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
                                           subsample=subsample,
                                           filter_flip=filter_flip,
                                           imshp=imshp, kshp=kshp)
        # The op also takes the spatial size of the inputs as input.
        c = c(filters, output, inputs_shape[-2:])
        c_ref = ref(filters, output, inputs_shape,
                    border_mode=border_mode, subsample=subsample,
                    conv_mode=conv_mode)
        f = theano.function([], c, mode)
        f_ref = theano.function([], c_ref, mode)
        res_ref = numpy.array(f_ref())
        res = numpy.array(f())
        utt.assert_allclose(res_ref, res)

        def abstract_conv2d_gradinputs(filters_val, output_val):
            conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode, subsample=subsample)
            return conv_op(filters_val, output_val, inputs_shape[-2:])

        if verify_grad:
            utt.verify_grad(abstract_conv2d_gradinputs, [filters_val, output_val],
                            mode=mode, eps=1)

    def test_dnn_conv(self):
        """Exercise the cuDNN lowering over the parameter grid."""
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)
        mode = mode_with_gpu
        # provide_shape is not used by the CuDNN implementation
        provide_shape = False
        for (i, f), s, b, flip in itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,
                self.border_modes,
                self.filter_flip):
            o = self.get_output_shape(i, f, s, b)
            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                         verify_grad=True, mode=mode, device='gpu',
                         provide_shape=provide_shape, border_mode=b,
                         filter_flip=flip)
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=True, mode=mode, device='gpu',
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=True, mode=mode, device='gpu',
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)

    def test_cormm_conv(self):
        """Exercise the GEMM/corrMM lowering (cuDNN excluded)."""
        # NOTE(review): this corrMM test is skipped when cuDNN is not
        # available, although it excludes 'cudnn' below — the reference
        # (dnn_conv) needs cuDNN; confirm that is the reason.
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)
        mode = mode_with_gpu.excluding('cudnn')
        for (i, f), s, b, flip, provide_shape in itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,
                self.border_modes,
                self.filter_flip,
                [False, True]):
            o = self.get_output_shape(i, f, s, b)
            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                         verify_grad=True, mode=mode, device='gpu',
                         provide_shape=provide_shape, border_mode=b,
                         filter_flip=flip)
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=True, mode=mode, device='gpu',
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=True, mode=mode, device='gpu',
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)

    def test_cpu_conv(self):
        """Exercise the CPU lowering, checking NotImplementedError on
        unsupported parameter combinations."""
        # NOTE(review): requiring dnn_available for a CPU test looks
        # unintended (the reference runs through dnn_* functions) —
        # confirm whether this skip is really needed here.
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)
        mode = mode_without_gpu
        for (i, f), s, b, flip, provide_shape in itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,
                self.border_modes,
                self.filter_flip,
                [False, True]):
            o = self.get_output_shape(i, f, s, b)
            # Flags tracking which directions the CPU path supports for
            # this parameter combination; unsupported ones must raise.
            fwd_OK = True
            gradweight_OK = True
            gradinput_OK = True
            if not flip:
                fwd_OK = False
                gradweight_OK = False
                gradinput_OK = False
            if b not in ('valid', 'full'):
                fwd_OK = False
                gradweight_OK = False
                gradinput_OK = False
            if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
                gradweight_OK = False
                gradinput_OK = False
            if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
                gradweight_OK = False
                gradinput_OK = False
            if fwd_OK:
                self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                             verify_grad=True, mode=mode, device='cpu',
                             provide_shape=provide_shape, border_mode=b,
                             filter_flip=flip)
            else:
                self.assertRaises(NotImplementedError,
                                  self.run_fwd,
                                  inputs_shape=i,
                                  filters_shape=f,
                                  subsample=s,
                                  verify_grad=False,
                                  mode=mode,
                                  device='cpu',
                                  provide_shape=provide_shape,
                                  border_mode=b,
                                  filter_flip=flip)
            if gradweight_OK:
                self.run_gradweight(inputs_shape=i, filters_shape=f,
                                    output_shape=o, subsample=s,
                                    verify_grad=False, mode=mode, device='cpu',
                                    provide_shape=provide_shape, border_mode=b,
                                    filter_flip=flip)
            else:
                self.assertRaises(NotImplementedError,
                                  self.run_gradweight,
                                  inputs_shape=i,
                                  filters_shape=f,
                                  output_shape=o,
                                  subsample=s,
                                  verify_grad=False,
                                  mode=mode,
                                  device='cpu',
                                  provide_shape=provide_shape,
                                  border_mode=b,
                                  filter_flip=flip)
            if gradinput_OK:
                self.run_gradinput(inputs_shape=i, filters_shape=f,
                                   output_shape=o, subsample=s,
                                   verify_grad=False, mode=mode, device='cpu',
                                   provide_shape=provide_shape, border_mode=b,
                                   filter_flip=flip)
            else:
                self.assertRaises(NotImplementedError,
                                  self.run_gradinput,
                                  inputs_shape=i,
                                  filters_shape=f,
                                  output_shape=o,
                                  subsample=s,
                                  verify_grad=False,
                                  mode=mode,
                                  device='cpu',
                                  provide_shape=provide_shape,
                                  border_mode=b,
                                  filter_flip=flip)
...@@ -158,9 +158,9 @@ class Conv3D(theano.Op): ...@@ -158,9 +158,9 @@ class Conv3D(theano.Op):
vidDur = V_shape[3] vidDur = V_shape[3]
filterDur = W_shape[3] filterDur = W_shape[3]
output_height = T.floor((vidHeight - filterHeight) // dr) + 1 output_height = ((vidHeight - filterHeight) // dr) + 1
output_width = T.floor((vidWidth - filterWidth) // dc) + 1 output_width = ((vidWidth - filterWidth) // dc) + 1
output_dur = T.floor((vidDur - filterDur) // dt) + 1 output_dur = ((vidDur - filterDur) // dt) + 1
rval = (batch_size, output_height, output_width, output_dur, output_channels) rval = (batch_size, output_height, output_width, output_dur, output_channels)
......
"""
Define abstract conv2d interface
"""
import logging
import theano
from theano.tensor import (as_tensor_variable, patternbroadcast)
from theano.tensor import TensorType
from theano.gof import Apply, Op
from theano.gof import local_optimizer
from theano.tensor.opt import register_specialize_device
# Cpu implementation
from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp
from theano.tensor.nnet.ConvGrad3D import convGrad3D
from theano.tensor.nnet.ConvTransp3D import convTransp3D
__docformat__ = "restructuredtext en"
_logger = logging.getLogger("theano.tensor.nnet.conv2d")
def conv2d(input,
           filters,
           input_shape=None,
           filter_shape=None,
           border_mode='valid',
           subsample=(1, 1),
           filter_flip=True):
    """
    Build the symbolic graph that convolves a mini-batch of 2D input
    maps with a set of 2D filters, as used in convolutional neural
    networks.

    :type input: symbolic 4D tensor
    :param input: mini-batch of feature map stacks, of shape
        (batch size, input channels, input rows, input columns).
        See also ``input_shape``.

    :type filters: symbolic 4D tensor
    :param filters: set of filters, of shape
        (output channels, input channels, filter rows, filter columns).
        See also ``filter_shape``.

    :type input_shape: None, tuple/list of len 4 of int or Constant variable
    :param input_shape: optional shape of ``input``, possibly used to
        select an optimal implementation. Any element may be ``None``
        if unknown at compile time.

    :type filter_shape: None, tuple/list of len 4 of int or Constant variable
    :param filter_shape: optional shape of ``filters``, possibly used to
        select an optimal implementation. Any element may be ``None``
        if unknown at compile time.

    :type border_mode: str, int or tuple of two int
    :param border_mode: one of:

        * ``'valid'``: filter applied wherever it completely overlaps
          the input; output shape: input shape - filter shape + 1.
        * ``'full'``: filter applied wherever it partly overlaps the
          input; output shape: input shape + filter shape - 1.
        * ``'half'``: zero-pad by ``filter rows // 2`` rows and
          ``filter columns // 2`` columns, then do a valid convolution;
          with odd filter sizes the output shape equals the input shape.
        * ``int``: symmetric zero-padding of that width, then a valid
          convolution.
        * ``(int1, int2)``: zero-pad ``int1`` rows and ``int2`` columns,
          then a valid convolution.

    :type subsample: tuple of len 2
    :param subsample: output subsampling factors (strides).

    :type filter_flip: bool
    :param filter_flip: ``True`` (default) flips filter rows and columns
        before sliding them over the input (a convolution); ``False``
        leaves them unflipped (a cross-correlation).

    :rtype: symbolic 4D tensor
    :return: feature maps of shape
        (batch size, output channels, output rows, output columns).
    """
    # Emit the abstract op; a graph optimization later replaces it with
    # a concrete implementation.
    op = AbstractConv2d(imshp=input_shape,
                        kshp=filter_shape,
                        border_mode=border_mode,
                        subsample=subsample,
                        filter_flip=filter_flip)
    return op(input, filters)
class BaseAbstractConv2d(Op):
    """
    Base class for AbstractConv

    Define an abstract convolution op that will be replaced with the
    appropriate implementation by a graph optimization.

    :type imshp: None, tuple/list of len 4 of int or Constant variable
    :param imshp: The shape of the input parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
        element is not known at compile time.
        imshp is defined w.r.t the forward conv.

    :type kshp: None, tuple/list of len 4 of int or Constant variable
    :param kshp: The shape of the filters parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
        element is not known at compile time.
        kshp is defined w.r.t the forward conv.

    :type border_mode: str, int or tuple of two int
    :param border_mode: Either of the following:

        * ``'valid'``: apply filter wherever it completely overlaps with the
          input. Generates output of shape: input shape - filter shape + 1
        * ``'full'``: apply filter wherever it partly overlaps with the input.
          Generates output of shape: input shape + filter shape - 1
        * ``'half'``: pad input with a symmetric border of ``filter rows // 2``
          rows and ``filter columns // 2`` columns, then perform a valid
          convolution. For filters with an odd number of rows and columns, this
          leads to the output shape being equal to the input shape.
        * ``int``: pad input with a symmetric border of zeros of the given
          width, then perform a valid convolution.
        * ``(int1, int2)``: pad input with a symmetric border of ``int1`` rows
          and ``int2`` columns, then perform a valid convolution.

    :type subsample: tuple of len 2
    :param subsample: factor by which to subsample the output.
        Also called strides elsewhere.

    :type filter_flip: bool
    :param filter_flip: If ``True``, will flip the filter rows and columns
        before sliding them over the input. This operation is normally referred
        to as a convolution, and this is the default. If ``False``, the filters
        are not flipped and the operation is referred to as a cross-correlation.
    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_flip', 'imshp', 'kshp')

    def __init__(self,
                 imshp=None, kshp=None,
                 border_mode="valid", subsample=(1, 1),
                 filter_flip=True):
        # Normalize integer / pair border modes to a canonical int pair.
        if isinstance(border_mode, int):
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a pair of'
                ' integers'.format(border_mode))

        self.imshp = imshp
        self.kshp = kshp
        self.border_mode = border_mode
        self.filter_flip = filter_flip

        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        # BUG FIX: store subsample as a tuple. `subsample` is part of
        # __props__, so storing a caller-supplied list would make the Op
        # unhashable and break op comparison/caching.
        self.subsample = tuple(subsample)

    def flops(self, inp, outp):
        """ Useful with the hack in profilemode to print the MFlops"""
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops
class AbstractConv2d(BaseAbstractConv2d):
    """
    Abstract Op for the forward convolution.

    See `BaseAbstractConv2d` for the meaning of the constructor
    parameters. The op itself has no implementation (`perform` raises);
    a graph optimization must replace it with a concrete one.
    """

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True):
        super(AbstractConv2d, self).__init__(imshp, kshp,
                                             border_mode, subsample, filter_flip)

    def make_node(self, img, kern):
        """Build the Apply node for img (4D) convolved with kern (4D)."""
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')

        # Output: (batch, output channels, rows, cols); only the batch
        # and channel dims can keep a broadcastable flag.
        broadcastable = [img.broadcastable[0],
                         kern.broadcastable[0],
                         False, False]
        output = img.type.clone(broadcastable=broadcastable)()
        return Apply(self, [img, kern], [output])

    def perform(self, node, inp, out_):
        # Abstract op: must have been replaced by an optimization.
        raise NotImplementedError('AbstractConv2d theano optimization failed')

    def grad(self, inp, grads):
        """Gradients w.r.t. the image and the weights, as abstract ops."""
        bottom, weights = inp
        top, = grads
        d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
                                             self.border_mode,
                                             self.subsample,
                                             self.filter_flip)(
            weights, top, bottom.shape[-2:])
        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
                                               self.border_mode,
                                               self.subsample,
                                               self.filter_flip)(
            bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class AbstractConv2d_gradWeights(BaseAbstractConv2d):
    """Gradient wrt. filters for `AbstractConv2d`.

    :note: You will not want to use this directly, but rely on
           Theano's automatic differentiation or graph optimization to
           use it as needed.
    """

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True):
        super(AbstractConv2d_gradWeights, self).__init__(imshp, kshp,
                                                         border_mode, subsample, filter_flip)

    # Update shape/height_width
    def make_node(self, img, topgrad, shape):
        """Build the Apply node; `shape` is the filters' spatial size."""
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')

        shape = as_tensor_variable(shape)
        # Output is the weight gradient:
        # (output channels, input channels, rows, cols).
        broadcastable = [topgrad.broadcastable[1],
                         img.broadcastable[1],
                         False, False]
        output = img.type.clone(broadcastable=broadcastable)()
        return Apply(self, [img, topgrad, shape], [output])

    def perform(self, node, inp, out_):
        # Abstract op: must have been replaced by an optimization.
        # NOTE(review): message says 'gradWeight' (missing trailing 's').
        raise NotImplementedError('AbstractConv2d_gradWeight theano optimization failed')

    def grad(self, inp, grads):
        """Gradients w.r.t. the image and the top gradient; the shape
        input is disconnected."""
        bottom, top = inp[:2]
        weights, = grads
        d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
                                             self.border_mode,
                                             self.subsample,
                                             self.filter_flip)(weights, top, bottom.shape[-2:])
        d_top = AbstractConv2d(self.imshp,
                               self.kshp,
                               self.border_mode,
                               self.subsample,
                               self.filter_flip)(bottom, weights)
        d_height_width = (theano.gradient.DisconnectedType()(),)
        return (d_bottom, d_top) + d_height_width

    def connection_pattern(self, node):
        return [[1], [1], [0]]  # no connection to height, width
class AbstractConv2d_gradInputs(BaseAbstractConv2d):
    """Gradient wrt. inputs for `AbstractConv2d`.

    :note: You will not want to use this directly, but rely on
           Theano's automatic differentiation or graph optimization to
           use it as needed.
    """

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True):
        super(AbstractConv2d_gradInputs, self).__init__(imshp, kshp,
                                                        border_mode,
                                                        subsample,
                                                        filter_flip)

    # Update shape/height_width
    def make_node(self, kern, topgrad, shape):
        """Build the Apply node.

        `shape` is the spatial (height, width) of the image to recover;
        it is taken as a symbolic input so it can be provided at runtime.
        """
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')

        shape = as_tensor_variable(shape)
        # Output is the image gradient, laid out as
        # (batch, input channels, rows, columns).
        broadcastable = [topgrad.type.broadcastable[0],
                         kern.type.broadcastable[1],
                         False, False]
        output = kern.type.clone(broadcastable=broadcastable)()
        return Apply(self, [kern, topgrad, shape], [output])

    def perform(self, node, inp, out_):
        # Abstract op: a graph optimization must replace it with a
        # concrete implementation before execution.
        # Fixed message: it used to name the wrong class
        # ('AbstractConv2d_gradWeight').
        raise NotImplementedError(
            'AbstractConv2d_gradInputs theano optimization failed')

    def grad(self, inp, grads):
        """Gradients wrt. the filters (weights) and the output grad (top)."""
        weights, top = inp[:2]
        bottom, = grads
        # Bug fix: forward self.filter_flip to both gradient ops.
        # Previously it was omitted, so the default (True) was silently
        # used, yielding wrong gradients when the op was built with
        # filter_flip=False (cross-correlation).
        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
                                               self.border_mode,
                                               self.subsample,
                                               self.filter_flip)(
                                                   bottom, top,
                                                   weights.shape[-2:])
        d_top = AbstractConv2d(self.imshp, self.kshp,
                               self.border_mode, self.subsample,
                               self.filter_flip)(bottom, weights)
        # The shape input carries no gradient.
        d_height_width = (theano.gradient.DisconnectedType()(),)
        return (d_weights, d_top) + d_height_width

    def connection_pattern(self, node):
        # No gradient connection to the height/width shape input.
        return [[1], [1], [0]]
# CPU optimization
@local_optimizer([AbstractConv2d])
def local_conv2d_cpu(node):
    """Lower `AbstractConv2d` to the legacy CPU `cpu_conv2d` op.

    Returns None (so other optimizers may apply) when the inputs are not
    plain CPU tensors, the border mode is unsupported, or the filters
    are not flipped.
    """
    if not isinstance(node.op, AbstractConv2d):
        return None
    img, kern = node.inputs
    both_cpu_tensors = (isinstance(img.type, TensorType) and
                        isinstance(kern.type, TensorType))
    if not both_cpu_tensors:
        return None
    if node.op.border_mode not in ('full', 'valid'):
        return None
    if not node.op.filter_flip:
        # Unflipped filters (cross-correlation) are not tested with
        # cpu_conv2d yet.
        return None
    return [cpu_conv2d(img, kern,
                       node.op.imshp, node.op.kshp,
                       border_mode=node.op.border_mode,
                       subsample=node.op.subsample)]


register_specialize_device(local_conv2d_cpu, 'fast_compile')
@local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_cpu(node):
    """Lower `AbstractConv2d_gradWeights` to concrete CPU ops.

    Uses a conv3D-based gradient for strided 'valid' convolutions, and
    the legacy `ConvOp` (with direction_hint='bprop weights') otherwise.
    Returns None when the rewrite does not apply so other optimizers may
    try.
    """
    img, topgrad, shape = node.inputs
    # Only applies when both inputs are plain CPU tensors.
    if ((not isinstance(img.type, TensorType) or
         not isinstance(topgrad.type, TensorType))):
        return None
    if node.op.border_mode not in ['full', 'valid']:
        return None
    if not node.op.filter_flip:
        # Not tested yet
        return

    if node.op.border_mode == 'valid' and \
            (node.op.subsample != (1, 1)):
        # Use the gradient as defined in conv3D, because the implementation
        # by Conv is slow (about 3x slower than conv3D, and probably 10x
        # slower than it could be), and incorrect when subsample > 2.
        # build a "node", that should be equivalent to the one given by
        # self.make_node, but using convGrad3D instead.
        # Insert a dummy depth axis ('x') to reuse the 3D gradient op.
        shuffled_img = img.dimshuffle(0, 2, 3, 'x', 1)
        shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
        rval = convGrad3D(V=shuffled_img,
                          d=(node.op.subsample[0], node.op.subsample[1], 1),
                          WShape=(shuffled_topgrad.shape[4],
                                  shape[0], shape[1], 1,
                                  shuffled_img.shape[4]),
                          dCdH=shuffled_topgrad)
        # Drop the dummy depth axis and restore the bc01 layout.
        rval = theano.tensor.addbroadcast(rval, 3)
        rval = rval.dimshuffle(0, 4, 1, 2)
        # Flip kernels back (convolution vs. cross-correlation).
        rval = rval[:, :, ::-1, ::-1]
        rval = patternbroadcast(rval, node.outputs[0].broadcastable)
        return [rval]

    dx, dy = node.op.subsample
    if dx not in (1, 2) or dy not in (1, 2):
        # Not implemented in the gradient of ConvOp
        return None

    # Fall back to 4-tuples of None when shapes were not provided.
    if node.op.imshp is None:
        op_imshp = (None, None, None, None)
    else:
        op_imshp = node.op.imshp

    if node.op.kshp is None:
        op_kshp = (None, None, None, None)
    else:
        op_kshp = node.op.kshp

    if None in op_imshp or None in op_kshp:
        if (dx, dy) != (1, 1):
            # We cannot infer the shapes
            return None

    # Determine gradient on kernels
    assert len(op_imshp) == 4 and len(op_kshp) == 4

    outshp = ConvOp.getOutputShape(op_imshp[2:],
                                   op_kshp[2:], node.op.subsample,
                                   node.op.border_mode)
    fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
                                       op_kshp[2:], (1, 1),
                                       node.op.border_mode)
    # The weight gradient is itself a convolution with the batch and
    # channel roles exchanged, hence the axis swap.
    newimg = img.dimshuffle((1, 0, 2, 3))
    newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))

    if node.op.border_mode == 'valid':
        (img, filters) = (newimg, newtopgrad)
        kshp_logical = fulloutshp
        kshp_logical_top_aligned = False
        imshp_logical = None
        (bsize, nkern) = (op_imshp[1], op_kshp[0])
        imshp = (op_imshp[0], op_imshp[2], op_imshp[3])
        kshp = outshp
    elif node.op.border_mode == 'full':
        # In 'full' mode the roles of image and filters are exchanged
        # relative to 'valid'.
        (img, filters) = (newtopgrad, newimg)
        kshp_logical = None
        kshp_logical_top_aligned = True
        imshp_logical = (op_imshp[0],
                         fulloutshp[0],
                         fulloutshp[1])
        (bsize, nkern) = (op_kshp[0], op_imshp[1])
        imshp = (op_imshp[0], outshp[0], outshp[1])
        kshp = op_imshp[2:]
    else:
        raise NotImplementedError(
            'Only [full,valid] modes are currently supported.')

    # Flip the kernels
    filters = filters[:, :, ::-1, ::-1]

    dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
                unroll_batch=None, unroll_kern=None, unroll_patch=None,
                imshp_logical=imshp_logical,
                kshp_logical=kshp_logical,
                kshp_logical_top_aligned=kshp_logical_top_aligned,
                direction_hint='bprop weights')
    res = dw(img, filters)

    if node.op.border_mode == 'valid':
        # Undo the axis swap and the kernel flip applied above.
        res = res.dimshuffle((1, 0, 2, 3))
        res = res[:, :, ::-1, ::-1]

    res = patternbroadcast(res, node.outputs[0].broadcastable)
    return [res]


register_specialize_device(local_conv2d_gradweight_cpu, 'fast_compile')
@local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_cpu(node):
    """Lower `AbstractConv2d_gradInputs` to concrete CPU ops.

    Uses a convTransp3D-based path for strided 'valid' convolutions, and
    the legacy `ConvOp` (with direction_hint='bprop inputs') otherwise.
    Returns None when the rewrite does not apply so other optimizers may
    try.
    """
    kern, topgrad, shape = node.inputs
    # Only applies when both inputs are plain CPU tensors.
    if ((not isinstance(kern.type, TensorType) or
         not isinstance(topgrad.type, TensorType))):
        return None
    if node.op.border_mode not in ['full', 'valid']:
        return None
    if not node.op.filter_flip:
        # Not tested yet
        return None

    # Conv 3d implementation, needed when subsample > 2
    if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):
        # Flip the kernels (convolution vs. cross-correlation) and insert
        # a dummy depth axis ('x') to reuse the transposed 3D conv op.
        kern = kern[:, :, ::-1, ::-1]
        shuffled_kern = kern.dimshuffle(0, 2, 3, 'x', 1)
        shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
        # convTransp3D requires a bias; use zeros of the right length.
        b = theano.tensor.zeros_like(shuffled_kern[0, 0, 0, 0, :])
        rval = convTransp3D(W=shuffled_kern, b=b,
                            d=(node.op.subsample[0], node.op.subsample[1], 1),
                            H=shuffled_topgrad,
                            RShape=(shape[0], shape[1], 1))
        # Drop the dummy depth axis and restore the bc01 layout.
        rval = theano.tensor.addbroadcast(rval, 3)
        rval = rval.dimshuffle(0, 4, 1, 2)
        rval = patternbroadcast(rval, node.outputs[0].broadcastable)
        return [rval]

    # Conv2d Implementation
    dx, dy = node.op.subsample
    if dx not in (1, 2) or dy not in (1, 2):
        # Not implemented in the gradient of ConvOp
        return None

    # Fall back to 4-tuples of None when shapes were not provided.
    if node.op.imshp is None:
        op_imshp = (None, None, None, None)
    else:
        op_imshp = node.op.imshp
    if node.op.kshp is None:
        op_kshp = (None, None, None, None)
    else:
        op_kshp = node.op.kshp
    if None in op_imshp or None in op_kshp:
        if (dx, dy) != (1, 1):
            # We cannot infer the shapes.
            return None

    # The transpose of a 'valid' convolution is a 'full' one and vice
    # versa, hence the mode swap below.
    mode = 'valid'
    if not node.op.border_mode == 'full':
        mode = 'full'
    # Swap the channel axes and flip the kernels for the backward pass.
    filters = kern.dimshuffle((1, 0, 2, 3))
    filters = filters[:, :, ::-1, ::-1]

    outshp = ConvOp.getOutputShape(op_imshp[2:],
                                   op_kshp[2:], node.op.subsample,
                                   node.op.border_mode)
    fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
                                       op_kshp[2:], (1, 1),
                                       node.op.border_mode)
    nkern = op_imshp[1]
    imshp = (op_kshp[0], outshp[0], outshp[1])
    imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1])
    din = ConvOp(imshp,
                 op_kshp[2:],
                 nkern,
                 op_imshp[0],
                 1, 1, output_mode=mode,
                 unroll_batch=None, unroll_kern=None,
                 unroll_patch=None,
                 imshp_logical=imshp_logical,
                 kshp_logical=None,
                 version=-1,
                 direction_hint='bprop inputs')
    din = din(topgrad, filters)
    din = patternbroadcast(din, node.outputs[0].broadcastable)
    return [din]


register_specialize_device(local_conv2d_gradinputs_cpu, 'fast_compile')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论