提交 cea45e8b authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5174 from abergeron/cormm_f16

Make corrMM work in float16/64
......@@ -173,11 +173,15 @@ class Kernel(object):
fname: str
the name of the function wrapper.
(defaults to name + `_call`)
sname: str
the name of the scheduled call function
(defaults to name + `_scall`)
"""
def __init__(self, code, params, name, flags,
codevar=None, binvar=None, objvar=None, fname=None):
codevar=None, binvar=None, objvar=None, fname=None,
sname=None):
self.code = code
self.params = params
self.name = name
......@@ -194,6 +198,9 @@ class Kernel(object):
if fname is None:
fname = name + '_call'
self.fname = fname
if sname is None:
sname = name + '_scall'
self.sname = sname
@staticmethod
def get_flags(*types):
......@@ -338,22 +345,30 @@ class GpuKernelBase(object):
setargs = '\n '.join(setargs)
return """
int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
{args}) {{
{setargs}
return GpuKernel_call(&{kname}, nd, ldim, gdim, shared, NULL);
return GpuKernel_call(&{kname}, _nd, _ldim, _gdim, _shared, NULL);
}}
""".format(args=args, fname=k.fname, setargs=setargs, kname=k.objvar)
def c_support_code(self):
return """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
size_t _ls = 0;
size_t _gs = 0;
int _err;
if (_nd != 1) return GA_UNSUPPORTED_ERROR;
_err = GpuKernel_sched(&{kname}, _n[0], &_ls, &_gs);
if (_err != GA_NO_ERROR)
return _err;
{setargs}
return GpuKernel_call(&{kname}, 1, &_ls, &_gs, _shared, NULL);
}}
""".format(args=args, fname=k.fname, setargs=setargs, sname=k.sname,
kname=k.objvar)
def c_support_code_apply(self, node, name):
kernels = self.gpu_kernels(node, name)
......@@ -428,7 +443,7 @@ int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
The node that we need the cache version for.
"""
return (6, self.get_params(node).bin_id)
return (7, self.get_params(node).bin_id)
def forward_string_meth(name):
......@@ -466,12 +481,14 @@ class CGpuKernelBase(COp, GpuKernelBase):
kernel_re = re.compile(r'^#kernel ([a-zA-Z_].*?)$', re.MULTILINE)
c_support_code = forward_string_meth('c_support_code')
c_support_code_apply = forward_string_meth('c_support_code_apply')
c_support_code_struct = forward_string_meth('c_support_code_struct')
c_init_code_struct = forward_string_meth('c_init_code_struct')
c_cleanup_code_struct = forward_string_meth('c_cleanup_code_struct')
def c_code_cache_version_apply(self, node):
# Call GpuKernelBase's implementation by name instead of relying on
# method resolution order: this class lists COp before GpuKernelBase.
# NOTE(review): presumably COp provides its own version of this method
# that would otherwise be picked first -- confirm.
return GpuKernelBase.c_code_cache_version_apply(self, node)
def _type_macros(self, node):
define_template = "#define %s %s\n"
undef_template = "#undef %s\n"
......
......@@ -414,7 +414,7 @@ gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
class BaseGpuCorrMM(CGpuKernelBase):
"""
Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
`GpuCorrMM_gradInputs`. Cannot be used directly.
......@@ -429,9 +429,9 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1)).
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation')
_f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)):
......@@ -489,9 +489,15 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
def get_params(self, node):
# The op's params value is the `context` attribute of the type of the
# node's first input.
return node.inputs[0].type.context
def c_headers(self):
# Headers for the generated C code: two system-style gpuarray headers
# (array and blas) plus the quoted local header "gpuarray_helper.h".
# NOTE(review): the quoted header is presumably located through the
# directory returned by c_header_dirs() -- confirm.
return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
def c_header_dirs(self):
# Return the directory that contains this Python module as the only
# extra header directory.
return [os.path.dirname(__file__)]
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 2)
# Raise this whenever modifying the code below.
return (2,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
"""
......@@ -953,7 +959,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
return [[1], [1], [0], [0]] # no connection to height, width
class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
class BaseGpuCorr3dMM(CGpuKernelBase):
"""
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
......@@ -967,10 +973,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
"""
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation')
_f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1, 1),
filter_dilation=(1, 1, 1)):
......@@ -1028,9 +1035,15 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
def get_params(self, node):
# The op's params value is the `context` attribute of the type of the
# node's first input.
return node.inputs[0].type.context
def c_headers(self):
# Headers for the generated C code: two system-style gpuarray headers
# (array and blas) plus the quoted local header "gpuarray_helper.h".
# NOTE(review): the quoted header is presumably located through the
# directory returned by c_header_dirs() -- confirm.
return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
def c_header_dirs(self):
# Return the directory that contains this Python module as the only
# extra header directory.
return [os.path.dirname(__file__)]
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 2)
# raise this whenever modifying the code below.
return (2,)
def c_code_helper(self, bottom, weights, top, direction, sub,
height=None, width=None, depth=None):
......
......@@ -613,6 +613,15 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_support_code(self):
# Support code for this op: a C++ template helper `ceil_intdiv(a, b)`
# that returns a/b plus one when the remainder a % b is non-zero, i.e.
# integer division rounded up.
# NOTE(review): this matches the mathematical ceiling only when the
# quotient is non-negative; presumably it is only called with sizes and
# counts -- confirm against the callers in c_code.
return """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
def c_code(self, node, name, inp, out, sub):
x, = inp
z, = out
......
......@@ -242,6 +242,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags=flags, objvar=k_var))
return kernels
def c_support_code(self):
# Support code for this op: a C++ template helper `ceil_intdiv(a, b)`
# that returns a/b plus one when the remainder a % b is non-zero, i.e.
# integer division rounded up.
# NOTE(review): this matches the mathematical ceiling only when the
# quotient is non-negative; presumably it is only called with sizes and
# counts -- confirm against the callers in c_code.
return """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype
dtype_neib_shape = node.inputs[1].dtype
......
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
import theano.tensor
import theano.gpuarray
if theano.gpuarray.pygpu is None:
......@@ -21,3 +22,10 @@ if theano.config.mode == 'FAST_COMPILE':
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
# If using float16, cast reference input to float32
# The decision is based on the dtype of the variable itself
# (x.type.dtype); a variable of any other dtype is returned unchanged.
# NOTE(review): presumably needed because the reference ops the tests
# compare against do not accept float16 inputs -- confirm.
def ref_cast(x):
if x.type.dtype == 'float16':
# Symbolic upcast; the original variable is not modified.
x = theano.tensor.cast(x, 'float32')
return x
......@@ -17,7 +17,7 @@ from .. import dnn
from ..basic_ops import GpuAllocEmpty
from ..type import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast
from . import test_nnet
from .rnn_support import Model, GRU, LSTM, WrapperLayer
......@@ -33,13 +33,6 @@ def set_precision(floatX):
return precision
# If using float16, cast reference input to float32
def ref_cast(x):
if theano.config.floatX == 'float16':
x = T.cast(x, 'float32')
return x
def test_dnn_conv_desc_merge():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
......
......@@ -3,13 +3,14 @@ import unittest
import numpy
import theano
from theano import config
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu
from .config import mode_with_gpu, mode_without_gpu, ref_cast
class TestCorrMM(unittest.TestCase):
......@@ -22,15 +23,16 @@ class TestCorrMM(unittest.TestCase):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
filters_val = numpy.random.random(filters_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val)
filters = gpuarray_shared_constructor(filters_val)
conv_ref = CorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(inputs, filters)
subsample=subsample)(ref_cast(inputs),
ref_cast(filters))
f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
conv = GpuCorrMM(border_mode=border_mode,
......@@ -120,20 +122,20 @@ class TestCorrMM(unittest.TestCase):
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]
inputs_val = numpy.random.random(inputs_shape).astype('float32')
dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
dCdH_val = numpy.random.random(dCdH_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val)
dCdH = gpuarray_shared_constructor(dCdH_val)
shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
if (subsample == (1, 1)):
conv_ref = CorrMM_gradWeights(subsample=subsample)(
inputs, dCdH)
ref_cast(inputs), ref_cast(dCdH))
conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
inputs, dCdH)
else:
conv_ref = CorrMM_gradWeights(subsample=subsample)(
inputs, dCdH, shape=shape)
ref_cast(inputs), ref_cast(dCdH), shape=shape)
conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
inputs, dCdH, shape=shape)
......@@ -167,8 +169,8 @@ class TestCorrMM(unittest.TestCase):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
filters_val = numpy.random.random(filters_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val)
filters = gpuarray_shared_constructor(filters_val)
......@@ -178,12 +180,13 @@ class TestCorrMM(unittest.TestCase):
if (subsample == (1, 1)):
conv_ref = CorrMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs)
kern=ref_cast(filters), topgrad=ref_cast(inputs))
conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs)
else:
conv_ref = CorrMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs, shape=bottom_shape)
kern=ref_cast(filters), topgrad=ref_cast(inputs),
shape=bottom_shape)
conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs, shape=bottom_shape)
......
......@@ -3,13 +3,14 @@ import unittest
import numpy
import theano
from theano import config
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr3d import Corr3dMM, Corr3dMM_gradWeights, Corr3dMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu
from .config import mode_with_gpu, mode_without_gpu, ref_cast
class TestCorr3dMM(unittest.TestCase):
......@@ -22,15 +23,15 @@ class TestCorr3dMM(unittest.TestCase):
inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
filters_val = numpy.random.random(filters_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val)
filters = gpuarray_shared_constructor(filters_val)
conv_ref = Corr3dMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(inputs, filters)
subsample=subsample)(ref_cast(inputs), ref_cast(filters))
f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
conv = GpuCorr3dMM(border_mode=border_mode,
......@@ -120,20 +121,20 @@ class TestCorr3dMM(unittest.TestCase):
filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]
inputs_val = numpy.random.random(inputs_shape).astype('float32')
dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
dCdH_val = numpy.random.random(dCdH_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val)
dCdH = gpuarray_shared_constructor(dCdH_val)
shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
if (subsample == (1, 1, 1)):
conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
inputs, dCdH)
ref_cast(inputs), ref_cast(dCdH))
conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
inputs, dCdH)
else:
conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
inputs, dCdH, shape=shape)
ref_cast(inputs), ref_cast(dCdH), shape=shape)
conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
inputs, dCdH, shape=shape)
......@@ -167,8 +168,8 @@ class TestCorr3dMM(unittest.TestCase):
inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
filters_val = numpy.random.random(filters_shape).astype(config.floatX)
inputs = gpuarray_shared_constructor(inputs_val)
filters = gpuarray_shared_constructor(filters_val)
......@@ -179,12 +180,12 @@ class TestCorr3dMM(unittest.TestCase):
if (subsample == (1, 1, 1)):
conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs)
kern=ref_cast(filters), topgrad=ref_cast(inputs))
conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs)
else:
conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs, shape=bottom_shape)
kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape)
conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
kern=filters, topgrad=inputs, shape=bottom_shape)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论