提交 6cc0c5ca authored 作者: notoraptor's avatar notoraptor

Update check_dnn.py.

- add sub-module check_dnn_doc.py to help check if tests fail as expected from cuDNN documentation. - re-organize whole code into `check_dnn.py`. - add a test case generator. - add classes to help choose algos at runtime without actually computing convolution. - Separate exhaustive tests and tests for choosing algos at runtime.
上级 ca407db4
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from itertools import product, chain from itertools import ifilter, product
import numpy as np import numpy as np
import theano import theano
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.compile.ops import shape_i_op from theano.compile.ops import shape_i_op
from theano.configdefaults import (SUPPORTED_DNN_CONV_ALGO_FWD, SUPPORTED_DNN_CONV3D_ALGO_FWD, from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER, SUPPORTED_DNN_CONV3D_ALGO_BWD_FILTER, from theano.gof import COp, Apply, ParamsType
SUPPORTED_DNN_CONV_ALGO_BWD_DATA, SUPPORTED_DNN_CONV3D_ALGO_BWD_DATA) from theano.gof.type import CDataType
from theano.gpuarray import cudnn_defs
from theano.gpuarray.basic_ops import infer_context_name, as_gpuarray_variable, gpu_contiguous, GpuAllocEmpty
from theano.gpuarray.dnn import (GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI, version, get_precision,
DnnBase, handle_type, DNN_CONV_ALGO_CHOOSE_ONCE, DNN_CONV_ALGO_CHOOSE_TIME)
from theano.gpuarray.tests.check_dnn_doc import check_fwd_algorithm
from theano.gpuarray.tests.config import mode_with_gpu, ref_cast
from theano.scalar import bool as bool_t
from theano.tensor.nnet.abstract_conv import get_conv_output_shape, assert_conv_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape, assert_conv_shape
from theano.tensor.opt import Assert from theano.tensor.opt import Assert
from .config import mode_with_gpu, ref_cast
from ..basic_ops import infer_context_name, as_gpuarray_variable, gpu_contiguous, GpuAllocEmpty
from ..dnn import (GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI, version, get_precision)
PRECISIONS = ('float16', 'float32', 'float64') cudnn = cudnn_defs.get_definitions(version(raises=False))
cudnnConvolutionFwdAlgo_t = cudnn.cudnnConvolutionFwdAlgo_t
cudnnConvolutionBwdFilterAlgo_t = cudnn.cudnnConvolutionBwdFilterAlgo_t
cudnnConvolutionBwdDataAlgo_t = cudnn.cudnnConvolutionBwdDataAlgo_t
AVAILABLE_PRECISIONS = cudnn.supported_precisions(theano.config.floatX)
class DnnCaseGenerator:
"""
Main class used to generate test cases.
"""
def _sub_size(self, sub_size=None):
    """Coerce a user-given sub-size to int, or derive a default from input_size."""
    if sub_size is None:
        return self.input_size // 3 + 1
    return int(sub_size)
def _at_least_one(self, value):
    """Return (1, value), collapsed to (1,) when value already equals 1."""
    if value == 1:
        return (value,)
    return (1, value)
def _shapes(self, size):
    """
    Shape tuples to test for a base `size`:
    - all ones (always included),
    - `size` repeated on every dimension (skipped when size == 1,
      since it would duplicate the all-ones shape),
    - a decreasing run ending at `size` (e.g. size + 2, size + 1, size).
    """
    ones = (1,) * self.ndim
    decreasing = tuple(size + self.ndim - 1 - dim for dim in range(self.ndim))
    if size == 1:
        return (ones, decreasing)
    return (ones, (size,) * self.ndim, decreasing)
def __init__(self,
             ndim=2, alpha=2, beta=-3, batch_size=2, input_channels=3, input_size=8, output_channels=2,
             filter_size=None, border_size=None, subsample_size=None, dilation_size=None):
    """
    Configure the test-case generator.

    ndim: number of spatial dimensions (must be >= 2).
    alpha: scalar factor applied to the convolution result (must be non-zero).
    beta: scalar factor applied to the previous output content.
    batch_size, input_channels, input_size, output_channels: base tensor dimensions.
    filter_size, border_size, subsample_size, dilation_size: base sizes for
        filters, padding, strides and dilation; when None, each defaults to
        input_size // 3 + 1 (see _sub_size).
    """
    self.ndim = int(ndim)
    self.alpha = float(alpha)
    self.beta = float(beta)
    self.batch_size = int(batch_size)
    self.input_channels = int(input_channels)
    self.input_size = int(input_size)
    self.output_channels = int(output_channels)
    self.filter_size = self._sub_size(filter_size)
    self.border_size = self._sub_size(border_size)
    self.subsample_size = self._sub_size(subsample_size)
    self.dilation_size = self._sub_size(dilation_size)
    # Sanity checks on the configuration (raise AssertionError on bad values).
    assert self.ndim >= 2
    assert self.alpha != 0
    assert self.batch_size > 0
    assert self.input_channels > 0
    assert self.input_size > 0
    assert self.output_channels > 0
    assert self.filter_size > 0
    assert self.border_size > 0
    assert self.subsample_size > 0
    assert self.dilation_size > 0
@staticmethod
def get_if_valid_conv_output_shape(case_tuple):
    """
    Compute and validate the convolution output shape of a test case.
    Return the asserted shape when valid, otherwise False
    (so it can be used as a filter predicate on generated cases).
    """
    in_shape, filter_shape, subsample, dilation = case_tuple[0]
    border_mode = case_tuple[1]
    shape = get_conv_output_shape(in_shape, filter_shape, border_mode,
                                  subsample, dilation)
    try:
        return assert_conv_shape(shape)
    except ValueError:
        return False
def get_cases(self):
def get_available_precisions(): # Generate an iterator of tuples with format:
# Starting from floatX up to max supported precision (float64). # ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta )
return PRECISIONS[PRECISIONS.index(theano.config.floatX):] all_batch_sizes = (self.batch_size,)
all_input_channels = (self.input_channels,)
all_input_sizes = self._shapes(self.input_size)
def array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation): all_output_channels = (self.output_channels,)
# Return an random array with inferred convolution output shape. all_filter_sizes = self._shapes(((self.filter_size - 1) * self.dilation_size + 1)
out_shp = get_conv_output_shape(inputs_shape, filters_shape, if cudnn.version < 6
border_mode, else self.filter_size)
subsample, all_subsamples = self._shapes(self.subsample_size)
filter_dilation=dilation) all_dilations = ((1,) * self.ndim,) if cudnn.version < 6 else self._shapes(self.dilation_size)
out_shp = assert_conv_shape(out_shp) all_border_modes = ('valid', 'full', 'half') + self._shapes(self.border_size)
return np.random.random(out_shp).astype(theano.config.floatX) all_conv_modes = ('conv', 'cross')
all_alphas = (self.alpha,)
all_betas = (0,) if self.beta == 0 else (0, self.beta)
all_input_shapes = ((bs, ic) + ins
for bs in all_batch_sizes for ic in all_input_channels for ins in all_input_sizes)
all_filter_shapes = ((oc, ic) + fis
for oc in all_output_channels for ic in all_input_channels for fis in all_filter_sizes)
return ifilter(DnnCaseGenerator.get_if_valid_conv_output_shape,
product(product(all_input_shapes, all_filter_shapes, all_subsamples, all_dilations),
all_border_modes, all_conv_modes, all_alphas, all_betas))
# We provide a special implementation of dnn_conv, dnn_gradweight and dnn_gradinput # We provide a special implementation of dnn_conv, dnn_gradweight and dnn_gradinput
...@@ -123,13 +207,94 @@ def dnn_gradinput(kerns, topgrad, img_shp, alpha=1, beta=0, out=None, border_mod ...@@ -123,13 +207,94 @@ def dnn_gradinput(kerns, topgrad, img_shp, alpha=1, beta=0, out=None, border_mod
return GpuDnnConvGradI(algo=algo)(kerns, topgrad, real_out, desc, alpha, beta) return GpuDnnConvGradI(algo=algo)(kerns, topgrad, real_out, desc, alpha, beta)
class BaseGpuDnnConvChooseAlgo(DnnBase):
    """
    Base Op used to retrieve a cuDNN convolution algorithm at runtime
    without computing any convolution, for a given runtime choice option
    (time_once, time_on_shape_change, guess_once or guess_on_shape_change).

    To help reduce whole test time, these Ops can be used when algo is one
    of the runtime choice options, as any chosen algorithm would have been
    tested by the other exhaustive tests.
    """
    # Flags consumed by the DnnBase/Op machinery -- presumably allow float16
    # inputs and skip default input validation; confirm against DnnBase.
    _f16_ok = True
    check_input = False
    __props__ = ('choice',)
    # Both booleans are derived from `choice` in __init__ and passed to C code.
    params_type = ParamsType(choose_once=bool_t, choose_time=bool_t, handle=handle_type)

    # Abstract attributes: subclasses must set the C source file and entry point.
    func_file = None
    func_name = None

    def __init__(self, choice):
        # NOTE(review): COp.__init__ is called directly (not DnnBase.__init__);
        # the two shared C sources are loaded relative to this tests directory.
        COp.__init__(self, ["../dnn_base.c", "../dnn_conv_base.c", self.func_file], self.func_name)
        assert choice in SUPPORTED_DNN_CONV_ALGO_RUNTIME
        self.choice = choice
        self.choose_once = self.choice in DNN_CONV_ALGO_CHOOSE_ONCE
        self.choose_time = self.choice in DNN_CONV_ALGO_CHOOSE_TIME

    def dnn_context(self, node):
        # The context of the first GPU input decides where the Op runs.
        return node.inputs[0].type.context_name

    def _prepare_inputs(self, i1, name_i1, i2, name_i2, output, desc):
        """
        Validate and convert the two tensor inputs and the output template
        to GPU variables on a common context.

        Raises TypeError when a tensor is not 4D/5D, when dimensions do not
        match, or when `desc` is not a cudnnConvolutionDescriptor_t.
        Returns the tuple (i1, i2, output, desc).
        """
        ctx_name = infer_context_name(i1, i2, output)
        i1 = as_gpuarray_variable(i1, ctx_name)
        i2 = as_gpuarray_variable(i2, ctx_name)
        output = as_gpuarray_variable(output, ctx_name)
        if i1.type.ndim not in (4, 5):
            raise TypeError('%s must be 4D or 5D tensor' % name_i1)
        if i2.type.ndim not in (4, 5):
            raise TypeError('%s must be 4D or 5D tensor' % name_i2)
        if output.type.ndim not in (4, 5):
            raise TypeError('output must be 4D or 5D tensor')
        if i1.type.ndim != i2.type.ndim or i1.type.ndim != output.type.ndim:
            raise TypeError("The number of dimensions of %s, %s and output must match" % (name_i1, name_i2))
        if not isinstance(desc.type, CDataType) or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')
        return (i1, i2, output, desc)
class GpuDnnConvChooseFwdAlgo(BaseGpuDnnConvChooseAlgo):
    """Select a cuDNN forward-convolution algorithm at runtime."""
    func_file = 'dnn_choose_fwd.c'
    func_name = 'APPLY_SPECIFIC(choose_fwd_algo)'

    def make_node(self, img, kern, output, desc):
        # Validate/convert inputs, then emit a node whose single output is
        # a cudnnConvolutionFwdAlgo_t scalar.
        prepared = self._prepare_inputs(img, 'img', kern, 'kern', output, desc)
        return Apply(self, list(prepared), [cudnn.cudnnConvolutionFwdAlgo_t()])
class GpuDnnConvChooseBwdFilterAlgo(BaseGpuDnnConvChooseAlgo):
    """Select a cuDNN backward-filter (gradweight) algorithm at runtime."""
    func_file = 'dnn_choose_gw.c'
    func_name = 'APPLY_SPECIFIC(choose_bwd_filter_algo)'

    def make_node(self, img, topgrad, output, desc):
        # Validate/convert inputs, then emit a node whose single output is
        # a cudnnConvolutionBwdFilterAlgo_t scalar.
        prepared = self._prepare_inputs(img, 'img', topgrad, 'topgrad', output, desc)
        return Apply(self, list(prepared), [cudnn.cudnnConvolutionBwdFilterAlgo_t()])
class GpuDnnConvChooseBwdDataAlgo(BaseGpuDnnConvChooseAlgo):
    """Select a cuDNN backward-data (gradinput) algorithm at runtime."""
    func_file = 'dnn_choose_gi.c'
    func_name = 'APPLY_SPECIFIC(choose_bwd_data_algo)'

    def make_node(self, kern, topgrad, output, desc):
        # Validate/convert inputs, then emit a node whose single output is
        # a cudnnConvolutionBwdDataAlgo_t scalar.
        prepared = self._prepare_inputs(kern, 'kern', topgrad, 'topgrad', output, desc)
        return Apply(self, list(prepared), [cudnn.cudnnConvolutionBwdDataAlgo_t()])
class BaseTestDnnConv(object): class BaseTestDnnConv(object):
"""
Base class for exhaustive tests. Use its subclasses
to run actual tests.
"""
_functions_checked_for_fwd = False _functions_checked_for_fwd = False
_functions_checked_for_gradinput = False _functions_checked_for_gradinput = False
_functions_checked_for_gradweight = False _functions_checked_for_gradweight = False
# Abstract attributes. # Abstract attributes.
ndim = 2
fwd_algorithms = None fwd_algorithms = None
bwd_filter_algorithms = None bwd_filter_algorithms = None
bwd_data_algorithms = None bwd_data_algorithms = None
...@@ -138,14 +303,19 @@ class BaseTestDnnConv(object): ...@@ -138,14 +303,19 @@ class BaseTestDnnConv(object):
cpu_gradinput_class = None cpu_gradinput_class = None
cpu_gradweight_class = None cpu_gradweight_class = None
# Abstract methods.
def get_cases(self): def get_cases(self):
# Should return an iterable of test cases. Each test case is a tuple (or list) with following syntax: # Return an iterable of test cases. Each test case is a tuple (or list) with following syntax:
# ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta ) # ( (input shape, filter shape, subsample, dilation), border mode, convolution mode, alpha, beta )
raise NotImplementedError generator = DnnCaseGenerator(ndim=self.ndim)
return generator.get_cases()
# Run and utility methods.
# Run methods. def array_like_conv_output(self, inputs_shape, filters_shape, border_mode, subsample, dilation):
# Return an random array with inferred convolution output shape.
out_shp = get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation)
out_shp = assert_conv_shape(out_shp)
return np.random.random(out_shp).astype(theano.config.floatX)
def run_conv_fwd(self, algo, precision, parameters): def run_conv_fwd(self, algo, precision, parameters):
(inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters (inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
...@@ -161,7 +331,7 @@ class BaseTestDnnConv(object): ...@@ -161,7 +331,7 @@ class BaseTestDnnConv(object):
inputs = theano.shared(inputs_val) inputs = theano.shared(inputs_val)
filters = theano.shared(filters_val) filters = theano.shared(filters_val)
out = None if beta == 0 else array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, out = None if beta == 0 else self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample,
dilation) dilation)
# Compile a theano function for the cuDNN implementation # Compile a theano function for the cuDNN implementation
conv = dnn_conv(img=inputs, kerns=filters, alpha=alpha, beta=beta, out=out, border_mode=border_mode, conv = dnn_conv(img=inputs, kerns=filters, alpha=alpha, beta=beta, out=out, border_mode=border_mode,
...@@ -196,13 +366,16 @@ class BaseTestDnnConv(object): ...@@ -196,13 +366,16 @@ class BaseTestDnnConv(object):
# Compare the results of the two implementations # Compare the results of the two implementations
res_ref = f_ref() res_ref = f_ref()
res = f() res = f()
if algo in cudnn.deterministic_fwd_algorithms:
res2 = f()
utt.assert_allclose(res, res2)
# Raise tolerance for float16 # Raise tolerance for float16
rtol = 6e-2 if theano.config.floatX == 'float16' else None rtol = 6e-2 if theano.config.floatX == 'float16' else None
if beta == 0: if beta == 0:
utt.assert_allclose(alpha * res_ref, res, rtol=rtol) utt.assert_allclose(alpha * res_ref, res, rtol=rtol)
else: else:
print('(conv: beta not null) ', end='') # print('(conv: beta not null) ', end='')
utt.assert_allclose(alpha * res_ref + beta * out, res, rtol=rtol) utt.assert_allclose(alpha * res_ref + beta * out, res, rtol=rtol)
def run_conv_gradinput(self, algo, precision, parameters): def run_conv_gradinput(self, algo, precision, parameters):
...@@ -210,7 +383,7 @@ class BaseTestDnnConv(object): ...@@ -210,7 +383,7 @@ class BaseTestDnnConv(object):
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX) inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX) filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
topgrad_val = array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation) topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
filters = theano.shared(filters_val) filters = theano.shared(filters_val)
topgrad = theano.shared(topgrad_val) topgrad = theano.shared(topgrad_val)
...@@ -251,8 +424,9 @@ class BaseTestDnnConv(object): ...@@ -251,8 +424,9 @@ class BaseTestDnnConv(object):
# Compare the results of the two implementations # Compare the results of the two implementations
res_ref = f_ref() res_ref = f_ref()
res = f() res = f()
# Needed for big size for some seed if algo in cudnn.deterministic_bwd_data_algorithms:
# raise rtol to make the test pass with more seed. res2 = f()
utt.assert_allclose(res, res2)
# Raise tolerance for float16 # Raise tolerance for float16
rtol = 5e-2 if theano.config.floatX == 'float16' else None rtol = 5e-2 if theano.config.floatX == 'float16' else None
...@@ -263,7 +437,7 @@ class BaseTestDnnConv(object): ...@@ -263,7 +437,7 @@ class BaseTestDnnConv(object):
inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX) inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
filters_val = np.random.random(filters_shape).astype(theano.config.floatX) filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
topgrad_val = array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation) topgrad_val = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
inputs = theano.shared(inputs_val) inputs = theano.shared(inputs_val)
topgrad = theano.shared(topgrad_val) topgrad = theano.shared(topgrad_val)
...@@ -296,137 +470,168 @@ class BaseTestDnnConv(object): ...@@ -296,137 +470,168 @@ class BaseTestDnnConv(object):
# Compare the results of the two implementations # Compare the results of the two implementations
res_ref = f_ref() res_ref = f_ref()
res = f() res = f()
# Needed for big size for some seed if algo in cudnn.deterministic_bwd_filter_algorithms:
# raise rtol to make the test pass with more seed. res2 = f()
utt.assert_allclose(res, res2)
# Raise tolerance for float16 # Raise tolerance for float16
rtol = 5e-2 if theano.config.floatX == 'float16' else None rtol = 5e-2 if theano.config.floatX == 'float16' else None
utt.assert_allclose(alpha * res_ref + beta * filters_val, res, rtol=rtol) utt.assert_allclose(alpha * res_ref + beta * filters_val, res, rtol=rtol)
def run_choose_runtime_algos(self, algo, precision, parameters):
    """
    Build and run a theano function that only retrieves the runtime-chosen
    cuDNN algorithms (bwd filter, bwd data, fwd) for one test case,
    without computing any convolution.
    """
    (inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
    out_shp = assert_conv_shape(
        get_conv_output_shape(inputs_shape, filters_shape, border_mode, subsample, dilation))
    # Shared variables filled with random floatX data.
    floatX = theano.config.floatX
    inputs = theano.shared(np.random.random(inputs_shape).astype(floatX))
    filters = theano.shared(np.random.random(filters_shape).astype(floatX))
    topgrad = theano.shared(self.array_like_conv_output(inputs_shape, filters_shape,
                                                        border_mode, subsample, dilation))
    ctx_name = infer_context_name(inputs, topgrad)
    desc_filter = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                                 conv_mode=conv_mode, precision=precision)(filters_shape)
    # Empty buffers with the right shapes: the choose Ops only need metadata.
    empty = GpuAllocEmpty(dtype=inputs.dtype, context_name=ctx_name)
    array_like_filters = empty(*filters_shape)
    array_like_inputs = empty(*inputs_shape)
    array_like_conv_output = empty(*out_shp)
    algo_filter = GpuDnnConvChooseBwdFilterAlgo(algo)(inputs, topgrad, array_like_filters, desc_filter)
    algo_input = GpuDnnConvChooseBwdDataAlgo(algo)(filters, topgrad, array_like_inputs, desc_filter)
    algo_conv = GpuDnnConvChooseFwdAlgo(algo)(inputs, filters, array_like_conv_output, desc_filter)
    f = theano.function([], [algo_filter, algo_input, algo_conv], mode=mode_with_gpu)
    # Just test that it runs and returns algorithm values.
    algo_filter_val, algo_input_val, algo_conv_val = f()
    # How to test if the chosen algorithms "work"? For now, running is enough.
def get_expected_tcount(self):
    """
    Utility method to compute the expected test count
    without actually running nosetests.

    Count = precisions x cases x (all fwd/bwd algorithms + runtime choices).
    """
    # sum(1 for _) counts the generated cases without materializing them.
    len_cases = sum(1 for _ in self.get_cases())
    print(len_cases, 'conv cases for %dD' % self.ndim)
    n_algos = len(self.fwd_algorithms +
                  self.bwd_data_algorithms +
                  self.bwd_filter_algorithms +
                  SUPPORTED_DNN_CONV_ALGO_RUNTIME)
    return len(AVAILABLE_PRECISIONS) * len_cases * n_algos
# Iterable test methods. # Iterable test methods.
def test_fwd(self): def test_fwd(self):
for precision in get_available_precisions(): for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.fwd_algorithms, self.get_cases()):
for algo in self.fwd_algorithms:
for parameters in self.get_cases():
yield (self.run_conv_fwd, algo, precision, parameters) yield (self.run_conv_fwd, algo, precision, parameters)
def test_gradinput(self): def test_gradinput(self):
for precision in get_available_precisions(): for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.bwd_data_algorithms, self.get_cases()):
for algo in self.bwd_data_algorithms:
for parameters in self.get_cases():
yield (self.run_conv_gradinput, algo, precision, parameters) yield (self.run_conv_gradinput, algo, precision, parameters)
def test_gradweight(self): def test_gradweight(self):
for precision in get_available_precisions(): for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.bwd_filter_algorithms, self.get_cases()):
for algo in self.bwd_filter_algorithms:
for parameters in self.get_cases():
yield (self.run_conv_gradweight, algo, precision, parameters) yield (self.run_conv_gradweight, algo, precision, parameters)
def test_choose_runtime_algos(self):
    """Yield one runtime-choice check per (precision, algo, case) combination."""
    combos = product(AVAILABLE_PRECISIONS, SUPPORTED_DNN_CONV_ALGO_RUNTIME,
                     self.get_cases())
    for precision, algo, parameters in combos:
        yield (self.run_choose_runtime_algos, algo, precision, parameters)
def check_fwd_predictions(self):
    """
    Check that FWD tests fail when they don't follow cuDNN V5.1 doc
    conditions for FWD algorithms.

    The script exits as soon as there is a test that does not fail
    when expected to.
    """
    print()
    print('TESTING FWD FAILURES PREDICTED FOR %dD' % self.ndim)
    count = 0
    for precision, algo, parameters in product(AVAILABLE_PRECISIONS, self.fwd_algorithms,
                                               self.get_cases()):
        (inputs_shape, filters_shape, subsample, dilation), border_mode, conv_mode, alpha, beta = parameters
        inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
        filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
        # Scale down the input values to prevent very large absolute errors
        # due to float rounding
        inputs_val /= 10
        filters_val /= 10
        out = self.array_like_conv_output(inputs_shape, filters_shape, border_mode, subsample, dilation)
        desc_op = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                                 conv_mode=conv_mode, precision=precision)
        should_compute = check_fwd_algorithm(inputs_val, filters_val, out, desc_op,
                                             algo, precision, subsample, dilation)
        if not should_compute.ok:
            infos = ['ndim : %s' % (len(inputs_shape) - 2),
                     'precision : %s' % precision]
            infos += should_compute.messages
            try:
                self.run_conv_fwd(algo, precision, parameters)
            except Exception as e:
                # Expected failure: report the first line of the error and continue.
                print('(FAILS as expected)', algo, precision, parameters)
                # str(e) works for any exception type; e.message does not exist
                # on all exceptions and was removed in Python 3.
                print(str(e).split('\n')[0])
                for info in infos:
                    print(info)
            else:
                # Test passed although the doc predicted a failure: abort.
                print('**SHOULD FAIL**|', algo, precision, parameters)
                for info in infos:
                    print(info)
                exit(-1)
        count += 1
        if count % 200 == 0:
            print(count, 'passed')
    print(count, 'finished')
class TestDnnConv2D(BaseTestDnnConv): class TestDnnConv2D(BaseTestDnnConv):
fwd_algorithms = SUPPORTED_DNN_CONV_ALGO_FWD ndim = 2
bwd_filter_algorithms = SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
bwd_data_algorithms = SUPPORTED_DNN_CONV_ALGO_BWD_DATA fwd_algorithms = cudnn.cudnnConvolutionFwdAlgo_t.get_aliases()
bwd_filter_algorithms = cudnn.cudnnConvolutionBwdFilterAlgo_t.get_aliases()
bwd_data_algorithms = cudnn.cudnnConvolutionBwdDataAlgo_t.get_aliases()
cpu_conv_class = theano.tensor.nnet.corr.CorrMM cpu_conv_class = theano.tensor.nnet.corr.CorrMM
cpu_gradinput_class = theano.tensor.nnet.corr.CorrMM_gradInputs cpu_gradinput_class = theano.tensor.nnet.corr.CorrMM_gradInputs
cpu_gradweight_class = theano.tensor.nnet.corr.CorrMM_gradWeights cpu_gradweight_class = theano.tensor.nnet.corr.CorrMM_gradWeights
def get_cases(self):
# Inspired from:
# - theano.tensor.nnet.tests.test_abstract_conv.BaseTestConv2d#setup_class
# - theano.tensor.nnet.tests.test_abstract_conv.BaseTestConv#test_all
inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9),
(0, 1, 6, 6), (1, 0, 6, 6), (1, 1, 6, 6)]
filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 3), (4, 1, 1, 3), (4, 5, 3, 2),
(1, 1, 2, 2), (1, 0, 2, 2), (0, 1, 2, 2)]
subsamples = [(1, 1), (2, 2), (2, 4)]
dilations = [(1, 1), (1, 2), (2, 1)]
default_subsample = (1, 1)
default_dilation = (1, 1)
border_modes = ["valid", "half", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
conv_modes = ['conv', 'cross']
assert len(inputs_shapes) == len(filters_shapes)
iterables = []
for input_shape, filter_shape in zip(inputs_shapes, filters_shapes):
if 0 not in input_shape and 0 not in filter_shape:
local_subsamples = subsamples
local_dilations = dilations
else:
local_subsamples = [default_subsample]
local_dilations = [default_dilation]
iterables += [product(product([input_shape], [filter_shape], local_subsamples, local_dilations),
border_modes,
conv_modes, [1], [0])]
return chain(*iterables)
class TestDnnConv3D(BaseTestDnnConv): class TestDnnConv3D(BaseTestDnnConv):
fwd_algorithms = SUPPORTED_DNN_CONV3D_ALGO_FWD ndim = 3
bwd_filter_algorithms = SUPPORTED_DNN_CONV3D_ALGO_BWD_FILTER
bwd_data_algorithms = SUPPORTED_DNN_CONV3D_ALGO_BWD_DATA fwd_algorithms = cudnn.conv3d_fwd_algorithms
bwd_filter_algorithms = cudnn.conv3d_bwd_filter_algorithms
bwd_data_algorithms = cudnn.conv3d_bwd_data_algorithms
cpu_conv_class = theano.tensor.nnet.corr3d.Corr3dMM cpu_conv_class = theano.tensor.nnet.corr3d.Corr3dMM
cpu_gradinput_class = theano.tensor.nnet.corr3d.Corr3dMM_gradInputs cpu_gradinput_class = theano.tensor.nnet.corr3d.Corr3dMM_gradInputs
cpu_gradweight_class = theano.tensor.nnet.corr3d.Corr3dMM_gradWeights cpu_gradweight_class = theano.tensor.nnet.corr3d.Corr3dMM_gradWeights
def get_cases(self):
# small case for quick test. if __name__ == '__main__':
input_shape = (128, 3, 5, 5, 5) test_2d = TestDnnConv2D()
filter_shape = (64, 3, 1, 2, 4) test_3d = TestDnnConv3D()
subsample = (1, 1, 1) print('2D algorithms:')
dilation = (1, 1, 1) print('FWD :', test_2d.fwd_algorithms)
border_mode = 'valid' print('BWD FILTER:', test_2d.bwd_filter_algorithms)
conv_mode = 'conv' print('BWD DATA :', test_2d.bwd_data_algorithms)
return (((input_shape, filter_shape, subsample, dilation), border_mode, conv_mode, 2.1, -5.7),) print('3D algorithms:')
print('FWD :', test_3d.fwd_algorithms)
def get_cases_real(self): print('BWD FILTER:', test_3d.bwd_filter_algorithms)
# Copy of: theano.gpuarray.tests.test_dnn.get_conv3d_test_cases print('BWD DATA :', test_3d.bwd_data_algorithms)
count_tests_2d = test_2d.get_expected_tcount()
# Every element of test_shapes follows the format count_tests_3d = test_3d.get_expected_tcount()
# [input_shape, filter_shape, subsample, dilation] print(count_tests_2d, 'total cases for 2D.')
test_shapes = [[(128, 3, 5, 5, 5), (64, 3, 1, 2, 4), (1, 1, 1), (1, 1, 1)], print(count_tests_3d, 'total cases for 3D.')
[(8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2), (1, 1, 1)], print(count_tests_2d + count_tests_3d, 'total cases.')
[(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 3, 3), (1, 1, 1)], import sys
[(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1), (1, 1, 1)],
# Test with 1x1x1 filters if len(sys.argv) == 2 and sys.argv[1] == 'run':
[(8, 1, 10, 10, 10), (10, 1, 1, 1, 1), (1, 1, 1), (1, 1, 1)], test_2d.check_fwd_predictions()
# Test with dimensions larger than 1024 (thread block dim) test_3d.check_fwd_predictions()
[(1025, 1, 2, 3, 4), (5, 1, 1, 2, 3), (1, 1, 1), (1, 1, 1)],
[(8, 1, 2, 3, 4), (1025, 1, 1, 2, 3), (1, 1, 1), (1, 1, 1)],
[(8, 1025, 2, 3, 4), (5, 1025, 1, 1, 2), (1, 1, 1), (1, 1, 1)],
[(8, 1, 1030, 3, 4), (5, 1, 1025, 1, 1), (1, 1, 1), (1, 1, 1)],
[(8, 1, 2, 1030, 4), (5, 1, 2, 1025, 1), (1, 1, 1), (1, 1, 1)],
[(8, 1, 2, 3, 1030), (5, 1, 1, 2, 1025), (1, 1, 1), (1, 1, 1)],
# The equivalent of this caused a crash with conv2d
[(1, 1, 1, 44800, 1), (6, 1, 1, 1, 1), (1, 1, 1), (1, 1, 1)]]
# With border mode 'full', test with kernel bigger than image in some/all
# dimensions
test_shapes_full = [[(6, 2, 2, 2, 2), (4, 2, 3, 1, 1), (1, 1, 1), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 1, 3, 1), (1, 1, 1), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 1, 1, 3), (1, 1, 1), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1), (1, 1, 1)]]
if version() >= 6000:
test_shapes.extend([
[(8, 1, 20, 12, 15), (5, 1, 6, 3, 4), (1, 1, 2), (3, 2, 1)],
[(8, 1, 20, 12, 15), (5, 1, 6, 3, 4), (2, 2, 1), (1, 2, 3)]])
test_shapes_full.append(
[(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1), (3, 2, 1)])
border_modes = ['valid', 'full', 'half', (1, 2, 3), (3, 2, 1), 1, 2]
conv_modes = ['conv', 'cross']
itt = chain(product(test_shapes, border_modes, conv_modes),
product(test_shapes_full, ['full'], conv_modes))
return itt
"""
This module is just a collection of definitions to be used by `check_dnn.py`.
Following classes, functions and definitions are used to check if
tests fail as expected when conditions listed into cuDNN documentation are not verified.
I have currently implemented checking only for 2D/3D FWD algorithms in cuDNN V5.1,
and in practice, many tests pass even when they don't follow cuDNN doc conditions.
So I think it is better to simply run all tests and check for ourselves
which tests pass, which fail, and why they fail.
Reminder:
N: batch number
C: number of feature maps
D: depth
H: height
W: width
NB: We assume that we **always** use NC(D)HW tensors in Theano.
"""
from __future__ import absolute_import, print_function, division
import theano
from ..cudnn_defs import HALF, FLOAT, DOUBLE, get_definitions
from ..dnn import version
UNKNOWN, TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG = -1, 0, 1, 2, 3
cudnn = get_definitions(version(raises=False))
cudnnConvolutionFwdAlgo_t = cudnn.cudnnConvolutionFwdAlgo_t
class Success:
    """
    Result object for a doc-condition check: `ok` tells whether the check
    passed, `messages` collects human-readable explanations.
    """
    ok = True
    # Class-level fallback only; every instance gets its own list in __init__.
    messages = []

    def __init__(self, messages=()):
        # Copy into a fresh list so instances never share state.
        # The default is an immutable tuple to avoid the mutable-default-argument
        # pitfall of the previous `messages=[]`.
        self.messages = list(messages)

    def add_message(self, *parts):
        # Stringify and concatenate all parts into a single message line.
        self.messages.append(''.join(str(part) for part in parts))
class Failure(Success):
    """A check result whose `ok` flag is False; inherited `messages` explain why."""
    ok = False
def _and(*tests):
    """
    All-of combinator. `tests` holds (callable, description) pairs;
    return Failure listing every failing description, or Success when
    all checks pass.
    """
    failed = [description for check, description in tests if not check()]
    if failed:
        return Failure(failed)
    return Success()
def _or(*tests):
    """
    Any-of combinator. Return Success as soon as one check passes
    (remaining checks are not evaluated); otherwise return Failure with
    the descriptions of every check tried.
    """
    failures = []
    for check, description in tests:
        if check():
            return Success()
        failures.append(description)
    return Failure(failures)
def type_conf(precision):
    """Classify the (floatX, precision) pair as a cuDNN data type configuration."""
    # All Op's input tensors are floatX tensors.
    floatX = theano.config.floatX
    if floatX == HALF:
        if precision == HALF:
            return TRUE_HALF_CONFIG
        if precision == FLOAT:
            return PSEUDO_HALF_CONFIG
    elif floatX == precision == FLOAT:
        return FLOAT_CONFIG
    elif floatX == precision == DOUBLE:
        return DOUBLE_CONFIG
    # Unrecognized combinations are reported as UNKNOWN instead of raising.
    return UNKNOWN
def type_conf_to_string(conf):
    """Human-readable name of a data-type configuration constant."""
    # Map the named constants instead of duplicating their literal values,
    # so this stays consistent if the constants at module level ever change.
    names = {
        UNKNOWN: 'UNKNOWN',
        TRUE_HALF_CONFIG: 'TRUE_HALF_CONFIG',
        PSEUDO_HALF_CONFIG: 'PSEUDO_HALF_CONFIG',
        FLOAT_CONFIG: 'FLOAT_CONFIG',
        DOUBLE_CONFIG: 'DOUBLE_CONFIG',
    }
    # Returns None for unrecognized values, matching the original fall-through.
    return names.get(conf)
def strideof(tensor, i):
    """Return the stride of dimension `i`, measured in elements (not bytes)."""
    byte_stride = tensor.strides[i]
    return byte_stride // tensor.itemsize
def tensor_is_partially_packed(tensor, packed_dim_names):
    """
    Check whether `tensor` is "partially packed" in the cuDNN sense:
    the dimensions named in `packed_dim_names` (letters from 'NCHW' or
    'NCDHW') must be exactly packed, while the remaining dimensions only
    need strides large enough that rows do not overlap.
    """
    # Dimension letters follow Theano's NC(D)HW layout assumption
    # (see module docstring).
    if tensor.ndim == 4:
        dim_names = 'NCHW'
    else:
        dim_names = 'NCDHW'
    packed_dims = []
    unpacked_dims = []
    # Split all but the last dimension according to the requested packing.
    for i in range(tensor.ndim - 1):
        if dim_names[i] in packed_dim_names:
            packed_dims.append(i)
        else:
            unpacked_dims.append(i)
    if dim_names[tensor.ndim - 1] in packed_dim_names and strideof(tensor, -1) != 1:
        # We won't put last dimension in the list of packed dims.
        # We just need to check if stride of that dimension is 1.
        return False
    # Unpacked dims: stride must be at least the extent of the inner block.
    # Packed dims: stride must exactly equal the extent of the inner block.
    return (all(strideof(tensor, i) >= tensor.shape[i + 1] * strideof(tensor, i + 1) for i in unpacked_dims) and
            all(strideof(tensor, i) == tensor.shape[i + 1] * strideof(tensor, i + 1) for i in packed_dims))
def tensor_is_fully_packed(tensor):
    """True when every dimension is contiguously packed (C-order, no gaps)."""
    if strideof(tensor, -1) != 1:
        return False
    return all(strideof(tensor, i) == tensor.shape[i + 1] * strideof(tensor, i + 1)
               for i in range(tensor.ndim - 1))
def check_fwd_algorithm(img, kern, out, desc_op, algo, precision, subsample, dilation):
    """Check whether a cuDNN forward convolution algorithm should work.

    Conditions are transcribed from the algorithm support tables of the
    cuDNN v5.1 user guide (``cudnnConvolutionForward``).

    Parameters
    ----------
    img, kern, out
        Convolution input, filters and output (objects exposing
        ``ndim``, ``shape``, ``strides`` and ``itemsize``).
    desc_op
        Convolution descriptor op; provides paddings ``pad0``/``pad1``/
        ``pad2`` and the border mode ``bmode``.
    algo
        Forward algorithm, as an alias accepted by
        ``cudnnConvolutionFwdAlgo_t.fromalias`` (defined elsewhere in
        this module).
    precision
        Precision string passed to the module-level ``type_conf`` helper
        to compute the data type configuration.
    subsample, dilation : tuple
        Filter strides and dilations, one value per spatial dimension.

    Returns
    -------
    The check object produced by the module-level ``_and`` helper. Its
    ``ok`` attribute is False when the cuDNN documentation says the
    algorithm should fail; in that case diagnostic context messages are
    prepended to the failure reasons.
    """
    # Based on cuDNN v5.1 user guide.
    ndim = img.ndim - 2
    # Indices of the depth/height/width entries inside the spatial
    # tuples (kern_shape, pad, subsample, img_with_borders).
    if ndim == 2:
        # rD won't be used.
        rD, rH, rW = -1, 0, 1
    else:
        rD, rH, rW = 0, 1, 2
    algo = cudnnConvolutionFwdAlgo_t.fromalias(algo)
    # Effective filter shape once dilation is applied.
    kern_shape = kern.shape[2:]
    kern_shape = tuple((kern_shape[i] - 1) * dilation[i] + 1 for i in range(len(dilation)))
    pad = (desc_op.pad0, desc_op.pad1, desc_op.pad2)[:len(kern_shape)]
    if desc_op.bmode == 'full':
        pad = tuple(kern_shape[i] - 1 for i in range(len(pad)))
    elif desc_op.bmode == 'half':
        pad = tuple(kern_shape[i] // 2 for i in range(len(pad)))
    img_shape = img.shape[2:]
    img_with_borders = tuple(img_shape[i] + 2 * pad[i] for i in range(len(pad)))

    def check_algo():
        # Each branch mirrors the support table of one algorithm in the
        # cuDNN documentation; ``_and`` lazily evaluates a sequence of
        # (condition, explanation) pairs.
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
            return _and((lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                         "Data Type Config Support: All except TRUE_HALF_CONFIG"))
        # CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM: 2D: everything supported.
        if ndim == 3 and algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
            return _and(
                (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                 "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                (lambda: tensor_is_fully_packed(img),
                 "xDesc Format Support: NCDHW-fully-packed"),
                (lambda: tensor_is_fully_packed(out),
                 "yDesc Format Support: NCDHW-fully-packed"),
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
            return _and(
                (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                 "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d")
            )
        # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
            return _and(
                (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                 "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: tensor_is_partially_packed(img, 'HW'),
                 "xDesc Format Support: NCHW HW-packed"),
                (lambda: tensor_is_partially_packed(out, 'HW'),
                 "yDesc Format Support: NCHW HW-packed"),
                (lambda: img_with_borders[rH] <= 256,
                 "xDesc 's feature map height + 2 * convDesc 's zero-padding height must equal 256 or less"),
                (lambda: img_with_borders[rW] <= 256,
                 "xDesc 's feature map width + 2 * convDesc 's zero-padding width must equal 256 or less"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] > pad[rH],
                 "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                (lambda: kern_shape[rW] > pad[rW],
                 "wDesc 's filter width must be greater than convDesc 's zero-padding width")
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
            if ndim == 2:
                return _and(
                    (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG),
                     "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG"),
                    (lambda: tensor_is_partially_packed(img, 'HW'),
                     "xDesc Format Support: NCHW HW-packed"),
                    (lambda: tensor_is_partially_packed(out, 'HW'),
                     "yDesc Format Support: NCHW HW-packed"),
                    (lambda: kern_shape[rH] <= 32,
                     "wDesc 's filter height must equal 32 or less"),
                    (lambda: kern_shape[rW] <= 32,
                     "wDesc 's filter width must equal 32 or less"),
                    (lambda: subsample[rH] == subsample[rW] == 1,
                     "convDesc 's vertical and horizontal filter stride must equal 1"),
                    (lambda: pad[rH] < kern_shape[rH],
                     "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                    (lambda: pad[rW] < kern_shape[rW],
                     "wDesc 's filter width must be greater than convDesc 's zero-padding width"),
                )
            if ndim == 3:
                return _and(
                    (lambda: type_conf(precision) != TRUE_HALF_CONFIG,
                     "Data Type Config Support: All except TRUE_HALF_CONFIG"),
                    (lambda: tensor_is_partially_packed(img, 'DHW'),
                     "xDesc Format Support: NCDHW DHW-packed"),
                    (lambda: tensor_is_partially_packed(out, 'DHW'),
                     "yDesc Format Support: NCDHW DHW-packed"),
                    (lambda: kern_shape[rH] <= 16,
                     "wDesc 's filter height must equal 16 or less"),
                    (lambda: kern_shape[rW] <= 16,
                     "wDesc 's filter width must equal 16 or less"),
                    (lambda: kern_shape[rD] <= 16,
                     "wDesc 's filter depth must equal 16 or less"),
                    (lambda: all(s == 1 for s in subsample),
                     "convDesc 's must have all filter strides equal to 1"),
                    (lambda: pad[rH] < kern_shape[rH],
                     "wDesc 's filter height must be greater than convDesc 's zero-padding height"),
                    (lambda: pad[rW] < kern_shape[rW],
                     "wDesc 's filter width must be greater than convDesc 's zero-padding width"),
                    # Bug fix: the depth condition previously compared the
                    # *width* padding (pad[rW]) against the depth filter
                    # size, and its message spoke of "width".
                    (lambda: pad[rD] < kern_shape[rD],
                     "wDesc 's filter depth must be greater than convDesc 's zero-padding depth"),
                )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
            return _and(
                # Message fixed to match the condition, which also accepts
                # DOUBLE_CONFIG.
                (lambda: type_conf(precision) in (PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG),
                 "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] == 3,
                 "wDesc 's filter height must be 3"),
                (lambda: kern_shape[rW] == 3,
                 "wDesc 's filter width must be 3"),
            )
        if algo == cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
            data_type_conf = type_conf(precision)
            return _and(
                (lambda: data_type_conf != DOUBLE_CONFIG,
                 "Data Type Config Support: All except DOUBLE_CONFIG"),
                (lambda: ndim == 2,
                 "Only for conv2d"),
                (lambda: subsample[rH] == subsample[rW] == 1,
                 "convDesc 's vertical and horizontal filter stride must equal 1"),
                (lambda: kern_shape[rH] == kern_shape[rW] and kern_shape[rH] in (3, 5),
                 "wDesc 's filter (height, width) must be (3,3) or (5,5)"),
                (lambda: kern_shape[rH] == 3 or data_type_conf != TRUE_HALF_CONFIG,
                 "If wDesc 's filter (height, width) is (5,5), "
                 "data type config TRUE_HALF_CONFIG is not supported")
            )
    checking = check_algo()
    if not checking.ok:
        # Prepend diagnostic context to the failure messages so a failing
        # test prints the convolution setup before the documentation rules.
        messages = checking.messages
        checking.messages = []
        checking.add_message('config : ', type_conf_to_string(type_conf(precision)))
        checking.add_message('computed borders : ', pad)
        checking.add_message('img with borders : ', img_with_borders)
        checking.add_message('computed kern shape: ', kern_shape)
        checking.add_message('== why should fail ==')
        checking.messages += messages
    return checking
#section init_code_struct

/* Initialize the forward-algorithm cache: nothing chosen yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
if (!PARAMS->choose_once) {
/* Shape caches are only consulted when the algorithm may be re-selected
   at runtime (choose_once == false), so only zero them in that case. */
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}

#section support_code_struct

/* Non-zero when prev_algo can be reused without a new selection. */
int reuse_algo;
/* Last forward algorithm selected by choose_fwd_algo. */
cudnnConvolutionFwdAlgo_t prev_algo;
/* Input and kernel shapes for which prev_algo was selected. */
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
/* Select a cuDNN forward convolution algorithm for (input, kerns) -> output.
 *
 * On success, stores the selected algorithm in *output_algo and returns 0.
 * On failure, sets a Python exception and returns non-zero.
 *
 * Caching: when params->choose_once is false, the previously selected
 * algorithm (prev_algo) is reused as long as input and kernel shapes match
 * the cached prev_img_dims / prev_kern_dims.  When choose_once is true, the
 * first selection is kept forever (reuse_algo stays set).
 *
 * Selection: params->choose_time picks between timing-based selection
 * (cudnnFindConvolutionForwardAlgorithmEx) and heuristic selection
 * (cudnnGetConvolutionForwardAlgorithm).  The result is then validated
 * against known cuDNN limitations (3d support, FFT constraints) and
 * replaced by IMPLICIT_GEMM when unusable.
 */
int
APPLY_SPECIFIC(choose_fwd_algo)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                                PyGpuArrayObject *output,
                                cudnnConvolutionDescriptor_t desc,
                                cudnnConvolutionFwdAlgo_t *output_algo,
                                PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }
  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
  cuda_enter(c->ctx);
  /* Validate that the given output has the shape cuDNN expects for this
     convolution, so we fail early with a clear message.
     NOTE(review): expected_output_dims is int[] but is printed below with
     %ld, and PyGpuArray_DIMS yields size_t values; the format specifiers
     rely on x86-64 varargs behavior -- confirm before porting. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }
  /* Reuse the cached algorithm only if both shapes are unchanged. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
    }
  }
  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;
    if (params->choose_time) {
      int count;
      cudnnConvolutionFwdAlgoPerf_t choice;
      gpudata *tmpmem;
      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* Bug fix: leave the CUDA context entered above before returning;
           also return 1 like every other error path (previously -1). */
        cuda_exit(c->ctx);
        return 1;
      }
      err = cudnnFindConvolutionForwardAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
        1, &count, &choice, *(void **)tmpmem,
        free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionForwardAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
        desc, APPLY_SPECIFIC(output),
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }
  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selection was made for. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
    }
  }
  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }
  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct

/* Initialize the backward-data algorithm cache: nothing chosen yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
if (!PARAMS->choose_once) {
/* Shape caches are only consulted when the algorithm may be re-selected
   at runtime (choose_once == false), so only zero them in that case. */
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}

#section support_code_struct

/* Non-zero when prev_algo can be reused without a new selection. */
int reuse_algo;
/* Last backward-data algorithm selected by choose_bwd_data_algo. */
cudnnConvolutionBwdDataAlgo_t prev_algo;
/* Kernel and output-gradient shapes for which prev_algo was selected. */
size_t prev_kern_dims[5];
size_t prev_top_dims[5];
/* Select a cuDNN backward-data convolution algorithm (gradient w.r.t.
 * the input) for (kerns, output-gradient) -> input-gradient.
 *
 * On success, stores the selected algorithm in *output_algo and returns 0.
 * On failure, sets a Python exception and returns non-zero.
 *
 * Caching: when params->choose_once is false, the previously selected
 * algorithm (prev_algo) is reused as long as kernel and output shapes
 * match the cached prev_kern_dims / prev_top_dims.  When choose_once is
 * true, the first selection is kept forever.
 *
 * Selection: params->choose_time picks between timing-based selection
 * (cudnnFindConvolutionBackwardDataAlgorithmEx) and heuristic selection
 * (cudnnGetConvolutionBackwardDataAlgorithm).  FFT-based choices are then
 * validated against their known restrictions and replaced by ALGO_0 when
 * unusable.
 */
int
APPLY_SPECIFIC(choose_bwd_data_algo)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                                     PyGpuArrayObject *input,
                                     cudnnConvolutionDescriptor_t desc,
                                     cudnnConvolutionBwdDataAlgo_t *output_algo,
                                     PARAMS_TYPE* params) {
  PyGpuContextObject *c = kerns->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size");
    return 1;
  }
  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  cudnnConvolutionBwdDataAlgo_t algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
  cuda_enter(c->ctx);
  /* Validate that the given gradient has the shape cuDNN expects for the
     corresponding forward convolution, so we fail early with a clear
     message.
     NOTE(review): expected_output_dims is int[] but is printed below with
     %ld, and PyGpuArray_DIMS yields size_t values; the format specifiers
     rely on x86-64 varargs behavior -- confirm before porting. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }
  /* Reuse the cached algorithm only if both shapes are unchanged. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }
  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;
    if (params->choose_time) {
      int count;
      cudnnConvolutionBwdDataAlgoPerf_t choice;
      gpudata *tmpmem;
      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* Bug fix: leave the CUDA context entered above before returning;
           also return 1 like every other error path (previously -1). */
        cuda_exit(c->ctx);
        return 1;
      }
      err = cudnnFindConvolutionBackwardDataAlgorithmEx(
        params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionBackwardDataAlgorithm(
        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(input),
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }
  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selection was made for. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); ++i) {
      prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    }
  }
  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct

/* Initialize the backward-filter algorithm cache: nothing chosen yet. */
reuse_algo = 0;
prev_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
if (!PARAMS->choose_once) {
/* Shape caches are only consulted when the algorithm may be re-selected
   at runtime (choose_once == false), so only zero them in that case. */
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}

#section support_code_struct

/* Non-zero when prev_algo can be reused without a new selection. */
int reuse_algo;
/* Last backward-filter algorithm selected by choose_bwd_filter_algo. */
cudnnConvolutionBwdFilterAlgo_t prev_algo;
/* Input and output-gradient shapes for which prev_algo was selected. */
size_t prev_img_dims[5];
size_t prev_top_dims[5];
/* Select a cuDNN backward-filter convolution algorithm (gradient w.r.t.
 * the weights) for (input, output-gradient) -> kernel-gradient.
 *
 * On success, stores the selected algorithm in *output_algo and returns 0.
 * On failure, sets a Python exception and returns non-zero.
 *
 * Caching: when params->choose_once is false, the previously selected
 * algorithm (prev_algo) is reused as long as input and output shapes
 * match the cached prev_img_dims / prev_top_dims.  When choose_once is
 * true, the first selection is kept forever.
 *
 * Selection: params->choose_time picks between timing-based selection
 * (cudnnFindConvolutionBackwardFilterAlgorithmEx) and heuristic selection
 * (cudnnGetConvolutionBackwardFilterAlgorithm).  An FFT choice is then
 * validated against its known restrictions and replaced by ALGO_0 when
 * unusable.
 */
int
APPLY_SPECIFIC(choose_bwd_filter_algo)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                                       PyGpuArrayObject *kerns,
                                       cudnnConvolutionDescriptor_t desc,
                                       cudnnConvolutionBwdFilterAlgo_t *output_algo,
                                       PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }
  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  cudnnConvolutionBwdFilterAlgo_t algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
  cuda_enter(c->ctx);
  /* Validate that the given gradient has the shape cuDNN expects for the
     corresponding forward convolution, so we fail early with a clear
     message.
     NOTE(review): expected_output_dims is int[] but is printed below with
     %ld, and PyGpuArray_DIMS yields size_t values; the format specifiers
     rely on x86-64 varargs behavior -- confirm before porting. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              PyGpuArray_NDIM(input), expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      cuda_exit(c->ctx);
      return 1;
    }
  } else if (PyGpuArray_NDIM(input) == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1],
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      cuda_exit(c->ctx);
      return 1;
    }
  }
  /* Reuse the cached algorithm only if both shapes are unchanged. */
  if (!params->choose_once) {
    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]);
      reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
  }
  if (!reuse_algo) {
    size_t free;
    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
    // Guess 4Mb if the info is not available
    if (free == 0) free = 4 * 1024 * 1024;
    if (params->choose_time) {
      int count;
      cudnnConvolutionBwdFilterAlgoPerf_t choice;
      gpudata *tmpmem;
      tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
      if (tmpmem == NULL) {
        PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
        /* Bug fix: leave the CUDA context entered above before returning;
           also return 1 like every other error path (previously -1). */
        cuda_exit(c->ctx);
        return 1;
      }
      err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
        params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
        APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
        APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
        1, &count, &choice, *(void **)tmpmem, free);
      gpudata_release(tmpmem);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
      algo = choice.algo;
    } else {
      err = cudnnGetConvolutionBackwardFilterAlgorithm(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
        desc, APPLY_SPECIFIC(kerns),
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
      }
    }
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }
  if (params->choose_once) {
    reuse_algo = 1;
  } else {
    /* Remember the shapes the selection was made for. */
    for (unsigned int i = 0; i < PyGpuArray_NDIM(input); ++i) {
      prev_img_dims[i] = PyGpuArray_DIM(input, i);
      prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
  }
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }
  *output_algo = algo;
  cuda_exit(c->ctx);
  return 0;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论