提交 6d4633be authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3737 from lamblin/gpuarray_abstractconv

Gpuarray abstractconv
......@@ -2406,14 +2406,14 @@ if True:
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(node.op, (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)):
return None
......
......@@ -237,124 +237,3 @@ class TestConv2d(unittest.TestCase):
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
def test_cormm_conv(self):
    """Exercise fwd / grad-weight / grad-input on the CPU corrMM path.

    Iterates over every combination of input/filter shape, subsample,
    border mode, filter_flip and provide_shape that the test class
    declares, verifying gradients for each call.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu
    configs = itertools.product(
        zip(self.inputs_shapes, self.filters_shapes),
        self.subsamples,
        self.border_modes,
        self.filter_flip,
        [False, True])
    for (in_shp, fil_shp), sub, bmode, fflip, with_shape in configs:
        out_shp = self.get_output_shape(in_shp, fil_shp, sub, bmode)
        # Keyword arguments shared by the three runners below.
        common = dict(verify_grad=True, mode=mode, device='cpu',
                      provide_shape=with_shape, border_mode=bmode,
                      filter_flip=fflip)
        self.run_fwd(inputs_shape=in_shp, filters_shape=fil_shp,
                     subsample=sub, **common)
        self.run_gradweight(inputs_shape=in_shp, filters_shape=fil_shp,
                            output_shape=out_shp, subsample=sub,
                            **common)
        self.run_gradinput(inputs_shape=in_shp, filters_shape=fil_shp,
                           output_shape=out_shp, subsample=sub,
                           **common)
def test_cpu_conv(self):
    """Exercise the plain CPU conv path (conv_gemm excluded).

    Configurations the legacy CPU ops support are run normally; the
    rest are expected to raise NotImplementedError.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu.excluding('conv_gemm')
    for (in_shp, fil_shp), sub, bmode, fflip, with_shape in \
            itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,
                self.border_modes,
                self.filter_flip,
                [False, True]):
        out_shp = self.get_output_shape(in_shp, fil_shp, sub, bmode)
        # The legacy path only implements flipped filters with
        # 'valid'/'full' border modes.
        fwd_ok = fflip and bmode in ('valid', 'full')
        # Both gradients share the forward restrictions plus two more
        # limits on the 'full' mode (in the original code the
        # grad-weight and grad-input flags were always equal).
        grad_ok = fwd_ok
        if (not with_shape) and sub != (1, 1) and bmode == 'full':
            grad_ok = False
        if bmode == 'full' and (sub[0] not in (1, 2) or
                                sub[1] not in (1, 2)):
            grad_ok = False
        if fwd_ok:
            self.run_fwd(inputs_shape=in_shp, filters_shape=fil_shp,
                         subsample=sub, verify_grad=True, mode=mode,
                         device='cpu', provide_shape=with_shape,
                         border_mode=bmode, filter_flip=fflip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_fwd,
                              inputs_shape=in_shp,
                              filters_shape=fil_shp,
                              subsample=sub,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=with_shape,
                              border_mode=bmode,
                              filter_flip=fflip)
        if grad_ok:
            self.run_gradweight(inputs_shape=in_shp,
                                filters_shape=fil_shp,
                                output_shape=out_shp, subsample=sub,
                                verify_grad=False, mode=mode,
                                device='cpu', provide_shape=with_shape,
                                border_mode=bmode, filter_flip=fflip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradweight,
                              inputs_shape=in_shp,
                              filters_shape=fil_shp,
                              output_shape=out_shp,
                              subsample=sub,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=with_shape,
                              border_mode=bmode,
                              filter_flip=fflip)
        if grad_ok:
            self.run_gradinput(inputs_shape=in_shp,
                               filters_shape=fil_shp,
                               output_shape=out_shp, subsample=sub,
                               verify_grad=False, mode=mode,
                               device='cpu', provide_shape=with_shape,
                               border_mode=bmode, filter_flip=fflip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradinput,
                              inputs_shape=in_shp,
                              filters_shape=fil_shp,
                              output_shape=out_shp,
                              subsample=sub,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=with_shape,
                              border_mode=bmode,
                              filter_flip=fflip)
差异被折叠。
差异被折叠。
差异被折叠。
......@@ -5,6 +5,7 @@ import warnings
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler
......@@ -12,17 +13,19 @@ from theano.gof.type import CDataType, Generic
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
get_conv_output_shape)
from theano.tensor.signal.downsample import (DownsampleFactorMax,
MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from .type import get_context, gpu_context_type, list_contexts
from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise
from .conv import GpuConv
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
......@@ -819,6 +822,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
                   subsample=(1, 1), conv_mode='conv'):
    """Build the cuDNN gradient of a convolution w.r.t. its kernels.

    Parameters mirror ``dnn_conv``; ``kerns_shp`` is the shape of the
    kernel tensor whose gradient is computed.
    """
    context = infer_context_name(img, topgrad)
    kerns_shp = as_tensor_variable(kerns_shp)
    # cuDNN requires contiguous inputs.
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    descriptor = GpuDnnConvDesc(border_mode=border_mode,
                                subsample=subsample,
                                conv_mode=conv_mode)(kerns_shp)
    # Uninitialized output buffer with the requested kernel shape.
    kerns_buf = GpuAllocEmpty(img.dtype, context)(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, kerns_buf, descriptor)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
                  subsample=(1, 1), conv_mode='conv'):
    """Build the cuDNN gradient of a convolution w.r.t. its inputs.

    Parameters mirror ``dnn_conv``; ``img_shp`` is the shape of the
    image tensor whose gradient is computed.
    """
    context = infer_context_name(kerns, topgrad)
    img_shp = as_tensor_variable(img_shp)
    # cuDNN requires contiguous inputs.
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    # The convolution descriptor is parameterized by the kernel shape.
    descriptor = GpuDnnConvDesc(border_mode=border_mode,
                                subsample=subsample,
                                conv_mode=conv_mode)(kerns.shape)
    # Uninitialized output buffer with the requested image shape.
    img_buf = GpuAllocEmpty(kerns.dtype, context)(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, img_buf, descriptor)
class GpuDnnPoolDesc(Op):
"""
This Op builds a pooling descriptor for use in the other
......@@ -1188,57 +1215,53 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()])
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv])
def local_conv_dnn(node):
    """Replace a ``GpuConv`` by the equivalent cuDNN forward convolution.

    Fires only when cuDNN is available in the output's context and the
    border mode is one cuDNN handles ('full' or 'valid').
    """
    if not isinstance(node.op, GpuConv):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    if node.op.border_mode not in ('full', 'valid'):
        return
    img, kern = node.inputs
    replacement = dnn_conv(img, kern,
                           border_mode=node.op.border_mode,
                           subsample=node.op.subsample,
                           direction_hint=node.op.direction_hint)
    return [replacement]
# This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
    """Replace a ``GpuConv`` by cuDNN, steering it to the *opposite*
    code path of ``local_conv_dnn``.

    For some input/kernel shape configurations the alternative
    direction is faster; the meta-optimizer tries both.
    """
    if not isinstance(node.op, GpuConv):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    if border_mode not in ('full', 'valid') or subsample != (1, 1):
        return
    img, kern = node.inputs
    hint = node.op.direction_hint
    if border_mode == 'full':
        # For a full convolution, try using the forward pass instead
        # of the backward pass wrt. inputs.
        hint = 'forward!'
    else:
        # border_mode == 'valid': swap between the forward pass and
        # the backward pass wrt. the weights.
        hint = 'forward' if hint == 'bprop weights' else 'bprop weights'
    replacement = dnn_conv(img, kern,
                           border_mode=border_mode,
                           subsample=subsample,
                           direction_hint=hint)
    return [replacement]
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """Lower an abstract conv2d (or one of its gradients) to cuDNN.

    Fires only when both inputs already live on the GPU
    (``GpuArrayType``) and cuDNN is available in that context.
    """
    if not isinstance(node.op, (AbstractConv2d,
                                AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if (not isinstance(inp1.type, GpuArrayType) or
            not isinstance(inp2.type, GpuArrayType)):
        return None
    if not dnn_available(inp1.type.context_name):
        return None

    # filter_flip selects a true convolution; otherwise a correlation.
    conv_mode = 'conv' if node.op.filter_flip else 'cross'
    # The three op classes are mutually exclusive, so if/elif/else is
    # equivalent to the original chain of independent ifs.
    if isinstance(node.op, AbstractConv2d):
        rval = dnn_conv(inp1, inp2,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='forward!',
                        conv_mode=conv_mode)
    elif isinstance(node.op, AbstractConv2d_gradWeights):
        # Kernel shape rebuilt from the inputs plus the symbolic
        # height/width passed as node.inputs[2].
        shape = (inp2.shape[1], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradweight(inp1, inp2, shape,
                              border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              conv_mode=conv_mode)
    else:  # AbstractConv2d_gradInputs
        # Image shape rebuilt from the inputs plus the symbolic
        # height/width passed as node.inputs[2].
        shape = (inp2.shape[0], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradinput(inp1, inp2, shape,
                             border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             conv_mode=conv_mode)
    return [rval]


# BUG FIX: in the scraped diff the 'local_conv_dnn' registration was
# interleaved *inside* the function body above (between a ``return None``
# and the filter_flip branch), where it would either be dead code or a
# syntax-breaking artifact.  Both registrations are module-level
# statements and belong here, after the optimizers they register.
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
                       'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
conv_groupopt.register('local_abstractconv_cudnn',
                       local_abstractconv_cudnn, 20,
                       'fast_compile', 'fast_run',
                       'gpuarray', 'conv_dnn', 'cudnn')
@inplace_allocempty(GpuDnnConv, 2)
......
......@@ -14,7 +14,13 @@ from theano.gof.optdb import LocalGroupDB
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor import as_tensor_variable
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv2d,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tests.breakpoint import PdbBreakpoint
from .type import (GpuArrayType, GpuArrayConstant, get_context,
......@@ -27,7 +33,6 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
......@@ -786,77 +791,49 @@ def local_assert(node, context_name):
@register_opt('fast_compile')
@op_lifter([ConvOp])
# Lift a CPU ConvOp onto the GPU as a GpuConv.
# NOTE(review): this block was recovered from a diff view with its
# indentation stripped; the statement order below may be a merge
# artifact (``logical_img_hw`` is passed to GpuConv before the branch
# that assigns it) -- confirm against the repository before editing.
def local_gpu_conv(node, context_name):
# Build a GpuConv matching the CPU op's parameters, or None when the
# logical kernel shape differs from the physical one (unsupported).
def GpuConvOp_from_ConvOp(op):
logical_img_hw = None
if op.kshp_logical is not None and op.kshp_logical != op.kshp:
return None
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp,
version=op.version,
direction_hint=op.direction_hint,
verbose=op.verbose,
imshp=op.imshp,
nkern=op.nkern,
bsize=op.bsize,
fft_opt=op.fft_opt)
if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
# Logical and physical image shapes differ: the image must be
# dilated (zero-padded with strides) before convolving.
rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] /
float(op.imshp[2])))
# Wrap the op in a small graph that scatters the physical image
# into a zero buffer of the logical shape, then runs GpuConv.
def make_graph(img, kern):
buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype),
img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img)
img = GpuFromHost(context_name)(img)
return ret(img, kern)
return make_graph
return ret
def values_eq_approx(a, b):
"""
Compare convolution outputs with a relaxed absolute tolerance.

Without this, DebugMode raises spurious errors from rounding:
the convolution reduces over the two last dimensions, so the
absolute error grows with the number of reduced elements.
"""
assert a.ndim == 4
atol = None
if a.shape[-1] * a.shape[-2] > 100:
# For float32 the default atol is 1e-5
atol = 3e-5
return GpuArrayType.values_eq_approx(a, b, atol=atol)
img, kern = node.inputs
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None:
return
out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern))
assert isinstance(out.type, GpuArrayType)
# Make sure to keep the broadcastable pattern of the original
# convolution even if we might gain or lose some due to different
# information at the node level.
out = tensor.patternbroadcast(out, node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx
return [out]
# Register this here so that it goes after 'local_gpu_conv'
def local_error_convop(node, context_name):
    """Unconditionally reject a leftover ConvOp on the gpuarray backend.

    Registered after 'local_gpu_conv' so it only fires when the lifting
    optimizer could not handle the node.
    """
    assert False, """
ConvOp does not work with the gpuarray backend.
Use the new convolution interface to have GPU convolution working:
theano.tensor.nnet.conv2d()
"""
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile')
@op_lifter([AbstractConv2d,
            AbstractConv2d_gradWeights,
            AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(node, context_name):
    """Transfer both conv inputs to the GPU and rebuild the same op there."""
    new_inputs = list(node.inputs)
    for idx in (0, 1):
        new_inputs[idx] = as_gpuarray_variable(node.inputs[idx],
                                               context_name=context_name)
    return [node.op(*new_inputs)]
# This will deal with ops that don't have an explicit transfer but
# have one of their inputs on the GPU already and the other not on the
# GPU (to avoid endlessly replacing things).
@register_opt('fast_compile')
@local_optimizer([AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_gpu_abstractconv2d(node):
    """Move an abstract conv to the GPU when exactly one input is there.

    Handles the case where there is no explicit transfer but one input
    is already a GpuArray and the other is not; requiring that not
    *both* are on the GPU avoids replacing the node endlessly.
    """
    if not isinstance(node.op, BaseAbstractConv2d):
        return
    on_gpu_0 = isinstance(node.inputs[0].type, GpuArrayType)
    on_gpu_1 = isinstance(node.inputs[1].type, GpuArrayType)
    # BUG FIX: the original condition was
    #   (on_gpu_0 or on_gpu_1) and not (on_gpu_0 or on_gpu_1)
    # which is a contradiction (always False), so the optimizer could
    # never fire.  Per the comment above this definition, the intent is
    # "at least one input on the GPU, but not both".
    if (on_gpu_0 or on_gpu_1) and not (on_gpu_0 and on_gpu_1):
        inps = list(node.inputs)
        ctx_name = infer_context_name(inps[0], inps[1])
        inps[0] = as_gpuarray_variable(inps[0], context_name=ctx_name)
        inps[1] = as_gpuarray_variable(inps[1], context_name=ctx_name)
        # NOTE(review): sibling optimizers return a *list* of
        # replacements; confirm whether a bare variable is accepted by
        # the optimizer framework here.
        return as_tensor_variable(node.op(*inps))
# Register this here so that it goes after the abstract lifting
register_opt()(conv_groupopt)
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论