Commit 2451318a authored by Nicolas Ballas, committed by Pascal Lamblin

new interface, work in progress

Parent 662ea98e
"""
FIXME
"""
__docformat__ = "restructuredtext en"
import copy
import logging

import numpy

import theano
from theano.gof import Apply, Op, local_optimizer
from theano.tensor import (as_tensor_variable, blas, get_scalar_constant_value,
                           patternbroadcast, NotScalarConstantError)
from theano.sandbox.cuda import gpu_optimizer, register_opt
from theano.sandbox.cuda.basic_ops import (
    GpuFromHost, HostFromGpu, gpu_contiguous, gpu_from_host, host_from_gpu
)
from theano.sandbox.cuda.blas import (GpuCorrMM, GpuCorrMM_gradWeights,
                                      GpuCorrMM_gradInputs)
from theano.sandbox.cuda.dnn import dnn_available, dnn_conv
imported_scipy_signal = False
try:
# TODO: move these back out to global scope when they no longer
# cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imported_scipy_signal = True
except ImportError:
pass
_logger = logging.getLogger("theano.tensor.nnet.conv")
def conv2d(img,
           filters,
           input_shape=None,
           filter_shape=None,
           batch_size=None,
           border_mode='valid',
           subsample=(1, 1),
           filter_flip=False):
    """
    Build the symbolic graph for convolving a mini-batch of a stack of 2D
    inputs with a set of 2D filters, as used in Convolutional Neural
    Networks (CNN).

    :type img: symbolic 4D tensor
    :param img: mini-batch of feature map stacks, of shape
        (batch size, input channels, input rows, input columns).
        See the optional parameter ``input_shape``.
    :type filters: symbolic 4D tensor
    :param filters: set of filters used in CNN layer of shape
        (output channels, input channels, filter rows, filter columns).
        See the optional parameter ``filter_shape``.
    :type input_shape: None, tuple/list of len 4 of int or Constant variable
    :param input_shape: The shape of the input parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that
        this element is not known at compile time.
    :type filter_shape: None, tuple/list of len 4 of int or Constant variable
    :param filter_shape: The shape of the filters parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that
        this element is not known at compile time.
    :type batch_size: None or int
    :param batch_size: The mini-batch size, if known at compile time.
    :type border_mode: str, int or tuple of two int
    :param border_mode: Either of the following:
        * ``'valid'``: apply filter wherever it completely overlaps with the
          input. Generates output of shape: input shape - filter shape + 1
        * ``'full'``: apply filter wherever it partly overlaps with the
          input. Generates output of shape: input shape + filter shape - 1
        * ``'half'``: pad input with a symmetric border of
          ``filter rows // 2`` rows and ``filter columns // 2`` columns,
          then perform a valid convolution. For filters with an odd number
          of rows and columns, this leads to the output shape being equal
          to the input shape.
        * ``int``: pad input with a symmetric border of zeros of the given
          width, then perform a valid convolution.
        * ``(int1, int2)``: pad input with a symmetric border of ``int1``
          rows and ``int2`` columns, then perform a valid convolution.
    :type subsample: tuple of len 2
    :param subsample: factor by which to subsample the output.
        Also called strides elsewhere.
    :type filter_flip: bool
    :param filter_flip: If ``True``, flip the filter rows and columns before
        sliding them over the input, so the operation is a true convolution.
        If ``False`` (the default here), the filters are not flipped and the
        operation is a cross-correlation.
    :rtype: symbolic 4D tensor
    :return: set of feature maps generated by convolutional layer. Tensor is
        of shape (batch size, output channels, output rows, output columns)
    """
    if filter_flip:
        # Flipping the kernels turns the underlying correlation into a true
        # convolution.
        # NOTE(review): assumes the Conv2d op's backend implementations
        # cross-correlate -- confirm against the actual lowerings.
        filters = filters[:, :, ::-1, ::-1]
    # Forward the user-supplied shape hints and geometry to the op.  (The
    # original referenced an undefined ``image_shape`` and hard-coded
    # border_mode='valid', subsample=(1, 1), ignoring the arguments.)
    conv_op = Conv2d(imshp=input_shape, kshp=filter_shape, bsize=batch_size,
                     border_mode=border_mode, subsample=tuple(subsample))
    return conv_op(img, filters)
class BaseConv2d(Op):
    """Base class for the Conv2d interface Ops.

    Holds the static geometry information (shape hints, border mode,
    subsampling) shared by the forward op and its two gradient ops.
    Instances compare and hash on ``border_mode`` and ``subsample`` only
    (see ``__props__``); the shape attributes are optimization hints.
    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample')

    def __init__(self,
                 imshp=None, kshp=None, bsize=None,
                 border_mode="valid", subsample=(1, 1)):
        """
        :param imshp: optional shape hint for the input images.
        :param kshp: optional shape hint for the kernels.
        :param bsize: optional mini-batch size hint.
        :param border_mode: 'valid', 'full', 'half', a non-negative int, or
            a pair of non-negative ints giving explicit zero padding.
        :param subsample: pair of output subsampling factors (strides).
        """
        if isinstance(border_mode, int):
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a pair of'
                ' integers'.format(border_mode))
        self.imshp = imshp
        # The original read ``self.kshp = kshp,`` -- the stray trailing
        # comma wrapped the shape in a 1-tuple and broke every later
        # ``self.kshp[...]`` access.
        self.kshp = kshp
        self.bsize = bsize
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        # Stored as a tuple so the op stays hashable through __props__.
        self.subsample = tuple(subsample)
        # Flipped to True by the GPU graph optimizers below when this op's
        # node is moved to the GPU.
        self.on_gpu = False

    @property
    def pad(self):
        """Padding as encoded in ``border_mode``; (0, 0) for 'valid'."""
        if self.border_mode != 'valid':
            return self.border_mode
        return (0, 0)

    def __str__(self):
        return '%s{%s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample))

    def flops(self, inp, outp):
        """Useful with the hack in profilemode to print the MFlops.

        ``inp`` holds the (input, filter) shapes, ``outp`` the output
        shape; if the output shape is correct, this gives the correct
        flops for any direction, sampling, padding, and border mode.
        """
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops
class Conv2d(BaseConv2d):
    """Forward 2D convolution.

    This op is a symbolic placeholder: it carries the convolution geometry
    and is expected to be replaced by a backend-specific implementation
    (cuDNN, CorrMM, ...) during graph optimization.  ``perform`` therefore
    always raises.
    """

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 bsize=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        # BaseConv2d.__init__ takes no separate ``pad`` argument: padding is
        # encoded in ``border_mode`` (see BaseConv2d.pad).  The original
        # forwarded ``pad`` to super(), which raised a TypeError; fold a
        # non-default pad into border_mode instead.
        if tuple(pad) != (0, 0):
            if border_mode != "valid":
                raise ValueError("'pad' can only be combined with "
                                 "border_mode='valid'")
            border_mode = tuple(map(int, pad))
        super(Conv2d, self).__init__(imshp, kshp, bsize,
                                     border_mode, subsample)

    def make_node(self, img, kern):
        """Build the Apply node computing the convolution of img by kern."""
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        broadcastable = [img.type.broadcastable[0],
                         kern.type.broadcastable[0],
                         False, False]
        # The original called ``broadcastable()`` -- a list is not callable.
        # Build the output variable from the input's type so the output
        # lives in the same space (CPU/GPU) and dtype as the input.
        output = img.type.clone(broadcastable=broadcastable)()
        return Apply(self, [img, kern], [output])

    def perform(self, node, inp, out_):
        # Placeholder op: must have been replaced by an optimizer before
        # execution (see module-level optimizers below).
        raise NotImplementedError('Conv2d theano optimization failed')

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        d_bottom = Conv2d_gradInputs(self.imshp, self.kshp, self.bsize,
                                     self.border_mode, self.subsample)(
                                         weights, top, bottom.shape[-2:])
        d_weights = Conv2d_gradWeights(self.imshp, self.kshp, self.bsize,
                                       self.border_mode, self.subsample)(
                                           bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class Conv2d_gradWeights(BaseConv2d):
    """Gradient wrt. filters for `Conv2d`.

    :note: You will not want to use this directly, but rely on
           Theano's automatic differentiation or graph optimization to
           use it as needed.
    """

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 bsize=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        # BaseConv2d.__init__ takes no separate ``pad`` argument: padding is
        # encoded in ``border_mode`` (see BaseConv2d.pad), so fold a
        # non-default pad into border_mode instead of forwarding it.
        if tuple(pad) != (0, 0):
            if border_mode != "valid":
                raise ValueError("'pad' can only be combined with "
                                 "border_mode='valid'")
            border_mode = tuple(map(int, pad))
        super(Conv2d_gradWeights, self).__init__(imshp, kshp, bsize,
                                                 border_mode, subsample)

    def make_node(self, img, topgrad, shape=None):
        """Build the node computing d(cost)/d(filters).

        ``shape`` is the symbolic (rows, cols) of the filters; it is
        required whenever it cannot be inferred from the other shapes.
        """
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        if self.subsample != (1, 1) or self.border_mode == "half":
            # With subsampling or 'half' padding the kernel shape is not
            # recoverable from img/topgrad shapes alone.
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1)'
                                 ' or border_mode == "half"')
            height_width = [shape[0], shape[1]]
        else:
            height_width = []
        broadcastable = [topgrad.type.broadcastable[1],
                         img.type.broadcastable[1],
                         False, False]
        # The original called ``broadcastable()`` -- a list is not callable.
        # Build the output variable from the input's type instead.
        output = img.type.clone(broadcastable=broadcastable)()
        return Apply(self, [img, topgrad] + height_width, [output])

    def perform(self, node, inp, out_):
        # Placeholder op: must have been replaced by an optimizer before
        # execution.
        raise NotImplementedError(
            'Conv2d_gradWeights theano optimization failed')

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        weights, = grads
        d_bottom = Conv2d_gradInputs(self.imshp, self.kshp, self.bsize,
                                     self.border_mode, self.subsample)(
                                         weights, top, bottom.shape[-2:])
        d_top = Conv2d(self.imshp, self.kshp, self.bsize,
                       self.border_mode, self.subsample)(bottom, weights)
        # The optional height/width inputs are shape hints only, so they
        # are disconnected from the cost.
        d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
                          if len(inp) == 4 else ())
        return (d_bottom, d_top) + d_height_width

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0]]  # no connection to height, width
class Conv2d_gradInputs(BaseConv2d):
    """Gradient wrt. inputs for `Conv2d`.

    :note: You will not want to use this directly, but rely on
           Theano's automatic differentiation or graph optimization to
           use it as needed.
    """
    # NOTE: this class must derive from BaseConv2d, not Conv2d (as the
    # original did): deriving from the forward op makes every
    # ``isinstance(node.op, Conv2d)`` dispatch in the optimizers below
    # misclassify gradient nodes as forward convolutions.

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 bsize=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        # BaseConv2d.__init__ takes no separate ``pad`` argument: padding is
        # encoded in ``border_mode`` (see BaseConv2d.pad), so fold a
        # non-default pad into border_mode instead of forwarding it.
        if tuple(pad) != (0, 0):
            if border_mode != "valid":
                raise ValueError("'pad' can only be combined with "
                                 "border_mode='valid'")
            border_mode = tuple(map(int, pad))
        super(Conv2d_gradInputs, self).__init__(imshp, kshp, bsize,
                                                border_mode, subsample)

    def make_node(self, kern, topgrad, shape=None):
        """Build the node computing d(cost)/d(inputs).

        ``shape`` is the symbolic (rows, cols) of the input images; it is
        required whenever subsampling makes it ambiguous.
        """
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        if self.subsample != (1, 1) and shape is None:
            raise ValueError('shape must be given if subsample != (1, 1)')
        height_width = [shape[0], shape[1]] if self.subsample != (1, 1) else []
        broadcastable = [topgrad.type.broadcastable[0],
                         kern.type.broadcastable[1],
                         False, False]
        # The original called ``broadcastable()`` -- a list is not callable.
        # Build the output variable from the input's type instead.
        output = kern.type.clone(broadcastable=broadcastable)()
        return Apply(self, [kern, topgrad] + height_width, [output])

    def perform(self, node, inp, out_):
        # Placeholder op: must have been replaced by an optimizer before
        # execution.  (Original message wrongly said 'Conv2d_gradWeight'.)
        raise NotImplementedError(
            'Conv2d_gradInputs theano optimization failed')

    def grad(self, inp, grads):
        weights, top = inp[:2]
        bottom, = grads
        d_weights = Conv2d_gradWeights(self.imshp, self.kshp, self.bsize,
                                       self.border_mode, self.subsample)(
                                           bottom, top, weights.shape[-2:])
        # The original referenced a nonexistent ``self.filter_shape``; the
        # attribute is ``self.kshp``.
        d_top = Conv2d(self.imshp, self.kshp, self.bsize,
                       self.border_mode, self.subsample)(bottom, weights)
        # The optional height/width inputs are shape hints only.
        d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
                          if len(inp) == 4 else ())
        return (d_weights, d_top) + d_height_width

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0]]  # no connection to height, width
### to Gpu optimization
@local_optimizer([gpu_from_host, Conv2d, Conv2d_gradWeights, Conv2d_gradInputs])
def local_conv2d_gpu_conv(node):
    """
    gpu_from_host(Conv) -> (gpu)_Conv(gpu_from_host)
    Conv(host_from_gpu) -> host_from_gpu((gpu)_Conv)

    Marks the moved op with ``on_gpu = True`` so the GPU-specific
    optimizers below can pick it up.  (The original took a spurious
    second ``convop`` parameter, which ``local_optimizer`` never passes.)
    """
    if isinstance(node.op, GpuFromHost):
        # gpu_from_host(conv) -> gpu_conv(gpu_from_host)
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           (Conv2d,
                                            Conv2d_gradWeights,
                                            Conv2d_gradInputs)):
            # Copy the op before flagging it: mutating the shared instance
            # would silently mark every other node using it as on-GPU too.
            gpu_conv = copy.copy(host_input.owner.op)
            gpu_conv.on_gpu = True
            inputs = host_input.owner.inputs
            img, kern = inputs[:2]
            # Grad ops may carry an extra (height, width) shape pair.
            extra = inputs[2:]
            if extra:
                out = gpu_conv(gpu_from_host(img), gpu_from_host(kern),
                               extra)
            else:
                out = gpu_conv(gpu_from_host(img), gpu_from_host(kern))
            # ``out`` is already on the GPU (its type is cloned from the
            # gpu_from_host inputs), so no extra transfer is needed.
            out = patternbroadcast(out, node.outputs[0].broadcastable)
            # out.values_eq_approx = values_eq_approx_high_tol
            return [out]
    if isinstance(node.op, (Conv2d, Conv2d_gradWeights, Conv2d_gradInputs)):
        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        inputs = node.inputs
        img, kern = inputs[:2]
        img_on_gpu = (img.owner and isinstance(img.owner.op, HostFromGpu))
        kern_on_gpu = (kern.owner and isinstance(kern.owner.op, HostFromGpu))
        if img_on_gpu or kern_on_gpu:
            gpu_conv = copy.copy(node.op)
            gpu_conv.on_gpu = True
            extra = inputs[2:]
            if extra:
                out = gpu_conv(gpu_from_host(img), gpu_from_host(kern),
                               extra)
            else:
                out = gpu_conv(gpu_from_host(img), gpu_from_host(kern))
            # The replaced node produced a host variable, so transfer the
            # GPU result back before substituting it.
            out = patternbroadcast(host_from_gpu(out),
                                   node.outputs[0].broadcastable)
            # out.values_eq_approx = values_eq_approx_high_tol
            return [out]

# We register the optimizer that moves convolutions to the GPU.
register_opt()(local_conv2d_gpu_conv)
#### GPU DNN optimization
@local_optimizer([Conv2d, Conv2d_gradWeights, Conv2d_gradInputs])
def local_conv2d_dnn(node):
    """Replace GPU-flagged Conv2d-family ops with cuDNN convolutions."""
    if not dnn_available():
        return
    # The original read a bare ``border_mode`` name that was never bound;
    # the op's attribute was intended.
    if node.op.border_mode not in ['full', 'valid']:
        return
    if isinstance(node.op, Conv2d) and node.op.on_gpu:
        img, kern = node.inputs
        rval = dnn_conv(img, kern,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='forward')
        return [rval]
    if isinstance(node.op, Conv2d_gradWeights) and node.op.on_gpu:
        # Gradient nodes may carry extra (height, width) shape inputs,
        # so unpack only the first two.
        img, kern = node.inputs[:2]
        rval = dnn_conv(img, kern,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='bprop weights')
        return [rval]
    if isinstance(node.op, Conv2d_gradInputs) and node.op.on_gpu:
        img, kern = node.inputs[:2]
        rval = dnn_conv(img, kern,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='bprop inputs')
        return [rval]

register_opt()(local_conv2d_dnn)
#### GPU CorrMM optimization
@local_optimizer([Conv2d])
def local_conv2d_gemm(node):
    """Replace a GPU-flagged forward Conv2d with a GpuCorrMM-based graph.

    (The original tested ``node.on_gpu`` -- an Apply has no such
    attribute; the flag lives on the op -- and called
    ``tensor.patternbroadcast`` with no ``tensor`` name in scope.)
    """
    if (isinstance(node.op, Conv2d) and
            node.op.on_gpu and
            node.op.border_mode in ['full', 'valid']):
        img, kern = node.inputs
        border_mode = node.op.border_mode
        subsample = node.op.subsample
        if (border_mode == 'valid') or (subsample != (1, 1)):
            # need to flip the kernel for valid convolution
            kern = kern[:, :, ::-1, ::-1]
            # By default use GpuCorrMM
            rval = GpuCorrMM(border_mode, subsample)(
                gpu_contiguous(img), gpu_contiguous(kern))
            # call GpuCorrMM_gradWeights if good
            # (the latter is faster if batchsize * kernelHeight * kernelWidth
            # is larger than inputChannels * outputHeight * outputWidth.
            # GpuConv does not always store information on the batchsize and
            # channels, though, so we only use what information we have.)
            if ((subsample == (1, 1)) and
                    (node.op.imshp is not None) and
                    (None not in node.op.imshp[-2:]) and
                    (node.op.kshp is not None) and
                    (None not in node.op.kshp)):
                # we know the kernel and output size
                prod1 = node.op.kshp[0] * node.op.kshp[1]
                prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
                         (node.op.imshp[-1] - node.op.kshp[1] + 1))
                if ((node.op.bsize is not None) and
                        (len(node.op.imshp) == 3) and
                        (node.op.imshp[0] is not None)):
                    # we also know batchsize and input channels
                    prod1 *= node.op.bsize
                    prod2 *= node.op.imshp[0]
                # compare to decide
                if prod1 > prod2:
                    # (we need to wrap the result in as_cuda_ndarray_variable,
                    # because we are not allowed to replace a CudaNdarray with
                    # a DimShuffle instance in a graph optimization)
                    rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
                        GpuCorrMM_gradWeights(border_mode, subsample)(
                            gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
                        ).dimshuffle(1, 0, 2, 3))
        elif border_mode == 'full':
            # need to dimshuffle the kernel for full convolution
            kern = kern.dimshuffle(1, 0, 2, 3)
            # call GpuCorrMM_gradInputs
            rval = GpuCorrMM_gradInputs('valid', subsample)(
                gpu_contiguous(kern), gpu_contiguous(img))
        if node.outputs[0].broadcastable != rval.broadcastable:
            # With given shape information, the replacement may return a
            # different broadcast pattern than the original op.  This is
            # forbidden, so we fix it.
            rval = patternbroadcast(
                rval, node.outputs[0].type.broadcastable)
        return [rval]
@local_optimizer([Conv2d_gradWeights])
def local_conv2d_gradweight_gemm(node):
    """Replace a GPU-flagged Conv2d_gradWeights with GpuCorrMM_gradWeights.

    (The original tested for the forward op ``Conv2d``, read the
    nonexistent ``node.on_gpu``, referenced undefined ``img``/``kern``,
    and misspelled the op class as ``GpuCorrMM_gradWeight``.)
    """
    if isinstance(node.op, Conv2d_gradWeights) and node.op.on_gpu:
        img, topgrad = node.inputs[:2]
        args = [gpu_contiguous(img), gpu_contiguous(topgrad)]
        if node.inputs[2:]:
            # Forward the optional symbolic (height, width) kernel-shape
            # pair as the ``shape`` argument.
            args.append(node.inputs[2:])
        rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                     subsample=node.op.subsample)(*args)
        return [rval]
@local_optimizer([Conv2d_gradInputs])
def local_conv2d_gradinputs_gemm(node):
    """Replace a GPU-flagged Conv2d_gradInputs with GpuCorrMM_gradInputs.

    (The original tested for the forward op ``Conv2d``, read the
    nonexistent ``node.on_gpu`` and referenced undefined ``img``/``kern``.)
    """
    if isinstance(node.op, Conv2d_gradInputs) and node.op.on_gpu:
        kern, topgrad = node.inputs[:2]
        args = [gpu_contiguous(kern), gpu_contiguous(topgrad)]
        if node.inputs[2:]:
            # Forward the optional symbolic (height, width) input-shape
            # pair as the ``shape`` argument.
            args.append(node.inputs[2:])
        rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                    subsample=node.op.subsample)(*args)
        return [rval]
# First we register the optimizer that moves convolutions to the GPU.
### Cpu Optmization
@local_optimizer([Conv2d_gradWeights])
def local_conv2d_cpu(node):
    # NOTE(review): work-in-progress stub.  Despite its name this body is a
    # verbatim copy of the GPU grad-weights optimizer, not a CPU lowering.
    # As written it is dead/broken code: it is tracked for
    # Conv2d_gradWeights yet tests for Conv2d; ``node.on_gpu`` does not
    # exist on an Apply (``node.op.on_gpu`` was presumably intended); and
    # ``GpuCorrMM_gradWeight``, ``img`` and ``kern`` are all undefined
    # names.  A real CPU implementation is still needed before this can be
    # registered.
    if isinstance(node.op, Conv2d) and node.on_gpu:
        rval = GpuCorrMM_gradWeight(border_mode=node.op.border_mode,
                                    subsample=node.op.subsample)(
            gpu_contiguous(img), gpu_contiguous(kern))
        return [rval]
@local_optimizer([Conv2d_gradWeights])
def local_conv2d_gradweight_cpu(node):
    # NOTE(review): work-in-progress stub copied from the GPU grad-weights
    # optimizer.  It checks the wrong op class (Conv2d instead of
    # Conv2d_gradWeights), reads the nonexistent ``node.on_gpu``
    # (``node.op.on_gpu`` was presumably intended), and uses the undefined
    # names ``GpuCorrMM_gradWeight``, ``img`` and ``kern``.  Needs a real
    # CPU lowering before registration.
    if isinstance(node.op, Conv2d) and node.on_gpu:
        rval = GpuCorrMM_gradWeight(border_mode=node.op.border_mode,
                                    subsample=node.op.subsample)(
            gpu_contiguous(img), gpu_contiguous(kern))
        return [rval]
@local_optimizer([Conv2d_gradInputs])
def local_conv2d_gradinputs_cpu(node):
    # NOTE(review): work-in-progress stub copied from the GPU optimizers.
    # It checks the wrong op class (Conv2d instead of Conv2d_gradInputs),
    # reads the nonexistent ``node.on_gpu`` (``node.op.on_gpu`` was
    # presumably intended), builds a GPU op rather than a CPU one, and
    # references the undefined names ``img`` and ``kern``.  Needs a real
    # CPU lowering before registration.
    if isinstance(node.op, Conv2d) and node.on_gpu:
        rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                    subsample=node.op.subsample)(
            gpu_contiguous(img), gpu_contiguous(kern))
        return [rval]
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment