Merge pull request #5174 from abergeron/cormm_f16

Make corrMM work in float16/64

Merge pull request #5174 from abergeron/cormm_f16
cea45e8b · Frédéric Bastien · GitHub · 29af0e5b · d79d38c1 · cea45e8b
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -173,11 +173,15 @@ class Kernel(object):
    fname: str
        the name of the function wrapper.
        (defaults to name + `_call`)
+    sname: str
+        the name of the scheduled call function
+        (defaults to name + `_scall`)
    """
    def __init__(self, code, params, name, flags,
-                 codevar=None, binvar=None, objvar=None, fname=None):
+                 codevar=None, binvar=None, objvar=None, fname=None,
+                 sname=None):
        self.code = code
        self.params = params
        self.name = name
@@ -194,6 +198,9 @@ class Kernel(object):
        if fname is None:
            fname = name + '_call'
        self.fname = fname
+        if sname is None:
+            sname = name + '_scall'
+        self.sname = sname
    @staticmethod
    def get_flags(*types):
@@ -338,22 +345,30 @@ class GpuKernelBase(object):
        setargs = '\n  '.join(setargs)
        return """
-int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
+int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
                  {args}) {{
  {setargs}
-  return GpuKernel_call(&{kname}, nd, ldim, gdim, shared, NULL);
+  return GpuKernel_call(&{kname}, _nd, _ldim, _gdim, _shared, NULL);
 }}
-        """.format(args=args, fname=k.fname, setargs=setargs, kname=k.objvar)
-    def c_support_code(self):
+int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
-        return """
+  size_t _ls = 0;
-        template <typename T>
+  size_t _gs = 0;
-        static T ceil_intdiv(T a, T b)
+  int _err;
-        {
-            return (a/b) + ((a % b) ? 1: 0);
+  if (_nd != 1) return GA_UNSUPPORTED_ERROR;
-        }
-        """
+  _err = GpuKernel_sched(&{kname}, _n[0], &_ls, &_gs);
+  if (_err != GA_NO_ERROR)
+    return _err;
+  {setargs}
+  return GpuKernel_call(&{kname}, 1, &_ls, &_gs, _shared, NULL);
+}}
+        """.format(args=args, fname=k.fname, setargs=setargs, sname=k.sname,
+                   kname=k.objvar)
    def c_support_code_apply(self, node, name):
        kernels = self.gpu_kernels(node, name)
@@ -428,7 +443,7 @@ int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
            The node that we need the cache version for.
        """
-        return (6, self.get_params(node).bin_id)
+        return (7, self.get_params(node).bin_id)
 def forward_string_meth(name):
@@ -466,12 +481,14 @@ class CGpuKernelBase(COp, GpuKernelBase):
    kernel_re = re.compile(r'^#kernel ([a-zA-Z_].*?)$', re.MULTILINE)
-    c_support_code = forward_string_meth('c_support_code')
    c_support_code_apply = forward_string_meth('c_support_code_apply')
    c_support_code_struct = forward_string_meth('c_support_code_struct')
    c_init_code_struct = forward_string_meth('c_init_code_struct')
    c_cleanup_code_struct = forward_string_meth('c_cleanup_code_struct')
+    def c_code_cache_version_apply(self, node):
+        return GpuKernelBase.c_code_cache_version_apply(self, node)
    def _type_macros(self, node):
        define_template = "#define %s %s\n"
        undef_template = "#undef %s\n"

--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -414,7 +414,7 @@ gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
 gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
-class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
+class BaseGpuCorrMM(CGpuKernelBase):
    """
    Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
    `GpuCorrMM_gradInputs`. Cannot be used directly.
@@ -429,9 +429,9 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1)).
    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')
+    _f16_ok = True
    def __init__(self, border_mode="valid", subsample=(1, 1),
                 filter_dilation=(1, 1)):
@@ -489,9 +489,15 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
    def get_params(self, node):
        return node.inputs[0].type.context
+    def c_headers(self):
+        return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
    def c_code_cache_version(self):
-        # raise this whenever modifying any of the support_code_files
+        # Raise this whenever modifying the code below.
-        return (0, 2)
+        return (2,)
    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
@@ -953,7 +959,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
            return [[1], [1], [0], [0]]  # no connection to height, width
-class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
+class BaseGpuCorr3dMM(CGpuKernelBase):
    """
    Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
    `GpuCorr3dMM_gradInputs`. Cannot be used directly.
@@ -967,10 +973,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
        Perform subsampling of the output (default: (1, 1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
-    """
+    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')
+    _f16_ok = True
    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
@@ -1028,9 +1035,15 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
    def get_params(self, node):
        return node.inputs[0].type.context
+    def c_headers(self):
+        return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
    def c_code_cache_version(self):
-        # raise this whenever modifying any of the support_code_files
+        # Raise this whenever modifying the code below.
-        return (0, 2)
+        return (2,)
    def c_code_helper(self, bottom, weights, top, direction, sub,
                      height=None, width=None, depth=None):

--- a/theano/gpuarray/corr3d_gemm.c
+++ b/theano/gpuarray/corr3d_gemm.c
--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -613,6 +613,15 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']
+    def c_support_code(self):
+        return """
+        template <typename T>
+        static T ceil_intdiv(T a, T b)
+        {
+            return (a/b) + ((a % b) ? 1: 0);
+        }
+        """
    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
@@ -242,6 +242,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                              flags=flags, objvar=k_var))
        return kernels
+    def c_support_code(self):
+        return """
+        template <typename T>
+        static T ceil_intdiv(T a, T b)
+        {
+            return (a/b) + ((a % b) ? 1: 0);
+        }
+        """
    def c_code(self, node, name, inp, out, sub):
        dtype_ten4 = node.inputs[0].dtype
        dtype_neib_shape = node.inputs[1].dtype

--- a/theano/gpuarray/tests/config.py
+++ b/theano/gpuarray/tests/config.py
 from __future__ import absolute_import, print_function, division
 from nose.plugins.skip import SkipTest
+import theano.tensor
 import theano.gpuarray
 if theano.gpuarray.pygpu is None:
@@ -21,3 +22,10 @@ if theano.config.mode == 'FAST_COMPILE':
 else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
+# If using float16, cast reference input to float32
+def ref_cast(x):
+    if x.type.dtype == 'float16':
+        x = theano.tensor.cast(x, 'float32')
+    return x
--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -17,7 +17,7 @@ from .. import dnn
 from ..basic_ops import GpuAllocEmpty
 from ..type import gpuarray_shared_constructor
-from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
+from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast
 from . import test_nnet
 from .rnn_support import Model, GRU, LSTM, WrapperLayer
@@ -33,13 +33,6 @@ def set_precision(floatX):
    return precision
-# If using float16, cast reference input to float32
-def ref_cast(x):
-    if theano.config.floatX == 'float16':
-        x = T.cast(x, 'float32')
-    return x
 def test_dnn_conv_desc_merge():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

--- a/theano/gpuarray/tests/test_gemmcorr.py
+++ b/theano/gpuarray/tests/test_gemmcorr.py
@@ -3,13 +3,14 @@ import unittest
 import numpy
 import theano
+from theano import config
 from theano.tests import unittest_tools as utt
 from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
 from ..type import gpuarray_shared_constructor
 from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
-from .config import mode_with_gpu, mode_without_gpu
+from .config import mode_with_gpu, mode_without_gpu, ref_cast
 class TestCorrMM(unittest.TestCase):
@@ -22,15 +23,16 @@ class TestCorrMM(unittest.TestCase):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
-        filters_val = numpy.random.random(filters_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        conv_ref = CorrMM(border_mode=border_mode,
                          filter_dilation=filter_dilation,
-                          subsample=subsample)(inputs, filters)
+                          subsample=subsample)(ref_cast(inputs),
+                                               ref_cast(filters))
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        conv = GpuCorrMM(border_mode=border_mode,
@@ -120,20 +122,20 @@ class TestCorrMM(unittest.TestCase):
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]
-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
-        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
+        dCdH_val = numpy.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
        if (subsample == (1, 1)):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
-                inputs, dCdH)
+                ref_cast(inputs), ref_cast(dCdH))
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
-                inputs, dCdH, shape=shape)
+                ref_cast(inputs), ref_cast(dCdH), shape=shape)
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
@@ -167,8 +169,8 @@ class TestCorrMM(unittest.TestCase):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
-        filters_val = numpy.random.random(filters_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
@@ -178,12 +180,13 @@ class TestCorrMM(unittest.TestCase):
        if (subsample == (1, 1)):
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
-                kern=filters, topgrad=inputs)
+                kern=ref_cast(filters), topgrad=ref_cast(inputs))
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
-                kern=filters, topgrad=inputs, shape=bottom_shape)
+                kern=ref_cast(filters), topgrad=ref_cast(inputs),
+                shape=bottom_shape)
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)

--- a/theano/gpuarray/tests/test_gemmcorr3d.py
+++ b/theano/gpuarray/tests/test_gemmcorr3d.py
@@ -3,13 +3,14 @@ import unittest
 import numpy
 import theano
+from theano import config
 from theano.tests import unittest_tools as utt
 from theano.tensor.nnet.corr3d import Corr3dMM, Corr3dMM_gradWeights, Corr3dMM_gradInputs
 from ..type import gpuarray_shared_constructor
 from ..blas import GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs
-from .config import mode_with_gpu, mode_without_gpu
+from .config import mode_with_gpu, mode_without_gpu, ref_cast
 class TestCorr3dMM(unittest.TestCase):
@@ -22,15 +23,15 @@ class TestCorr3dMM(unittest.TestCase):
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
-        filters_val = numpy.random.random(filters_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        conv_ref = Corr3dMM(border_mode=border_mode,
                            filter_dilation=filter_dilation,
-                            subsample=subsample)(inputs, filters)
+                            subsample=subsample)(ref_cast(inputs), ref_cast(filters))
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        conv = GpuCorr3dMM(border_mode=border_mode,
@@ -120,20 +121,20 @@ class TestCorr3dMM(unittest.TestCase):
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]
-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
-        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
+        dCdH_val = numpy.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
        if (subsample == (1, 1, 1)):
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
-                inputs, dCdH)
+                ref_cast(inputs), ref_cast(dCdH))
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
-                inputs, dCdH, shape=shape)
+                ref_cast(inputs), ref_cast(dCdH), shape=shape)
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
@@ -167,8 +168,8 @@ class TestCorr3dMM(unittest.TestCase):
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
-        filters_val = numpy.random.random(filters_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
@@ -179,12 +180,12 @@ class TestCorr3dMM(unittest.TestCase):
        if (subsample == (1, 1, 1)):
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
-                kern=filters, topgrad=inputs)
+                kern=ref_cast(filters), topgrad=ref_cast(inputs))
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
-                kern=filters, topgrad=inputs, shape=bottom_shape)
+                kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)