提交 87641e6f authored 作者: notoraptor's avatar notoraptor

Remove `AlgoRec.dataType` field as it is used nowhere.

Add some tests, to check at least if everything runs well. Standardize all three files (dnn_fwd, dnn_gi, dnn_gw), to have the same code logic and organization across these three codes. Add fallback entries as fallback functions. Add `-t` to hash for timed algorithms.
上级 5ce08bc8
...@@ -17,7 +17,6 @@ static inline int cudnnGetVersion() { ...@@ -17,7 +17,6 @@ static inline int cudnnGetVersion() {
/* a common struct for all 3 CUDNN enums */ /* a common struct for all 3 CUDNN enums */
struct AlgoRec { struct AlgoRec {
int algo; int algo;
cudnnDataType_t dataType;
size_t wsSize; size_t wsSize;
cudnnMathType_t mathType; cudnnMathType_t mathType;
}; };
......
#section init_code_struct #section init_code_struct
prev_algo.algo = PARAMS->conv_algo; prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
use_cached = 0;
hash_prefix = std::string("GW|GPU#"); hash_prefix = std::string("GW|GPU#");
#section support_code_struct #section support_code_struct
#line 11 "dnn_gw.c" #line 9 "dnn_gw.c"
int reuse_algo; int reuse_algo;
bool use_cached;
AlgoRec prev_algo; AlgoRec prev_algo;
std::string hash_prefix; std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail.
Return 0 on success, non-0 on error. **/
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error. **/
int dnn_conv_gw_fallback(cudnnConvolutionBwdFilterAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024, so if FFT was chosen we must
  // validate it against the current data and fall back to a safe algorithm
  // when it cannot be used. Checking only the 4d (2d-convolution) case is
  // fine, as FFT and tiled-FFT are defined only for 2d filters.
  if (*_algo != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT ||
      PyGpuArray_NDIM(input) != 4)
    return 0;

  // Extract the properties of the convolution descriptor.
  int nbDims;
  int padding[2];
  int strides[2];
  int dilation[2];
  cudnnConvolutionMode_t conv_mode;
  cudnnDataType_t dtype;
  cudnnStatus_t status = cudnnGetConvolutionNdDescriptor(
      desc, 2, &nbDims, padding, strides, dilation, &conv_mode, &dtype);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                 cudnnGetErrorString(status));
    return 1;
  }

  const bool has_strides = (strides[0] != 1 || strides[1] != 1);
  const bool input_too_large =
      (PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024);
  const bool kerns_are_1x1 =
      (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1);
  if (has_strides || input_too_large || kerns_are_1x1) {
#ifdef DEBUG
    fprintf(stderr, "(replacing gradweight algo fft with none)\n");
#endif
    *_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
  }
  return 0;
}
int int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...@@ -24,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -24,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
void *beta_p; void *beta_p;
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -82,17 +128,14 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -82,17 +128,14 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), groups) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups; size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / groups; size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups; size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey ; std::string hashkey ;
size_t free = c_get_largest_free_block_size(c); size_t free = c_get_largest_free_block_size(c);
...@@ -105,11 +148,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -105,11 +148,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups); hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
if (hashkey.empty()) if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1; return 1;
hashkey = hash_prefix + pci_id + hashkey; }
// check out cache hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey); const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) { if (cached) {
prev_algo = *cached; prev_algo = *cached;
...@@ -122,7 +167,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -122,7 +167,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
worksize = prev_algo.wsSize; worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType; mathtype = prev_algo.mathType;
} else { } else {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice; cudnnConvolutionBwdFilterAlgoPerf_t choice;
...@@ -131,6 +175,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -131,6 +175,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1; return -1;
} }
...@@ -152,11 +197,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -152,11 +197,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found"); PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
cuda_exit(c->ctx);
return 1; return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) { } else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradweight algo: %s", "error getting best-timed gradweight algo: %s",
cudnnGetErrorString(choice.status)); cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1; return 1;
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
...@@ -167,8 +214,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -167,8 +214,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#if CUDNN_MAJOR >= 7 #if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType; prev_algo.mathType = mathtype = choice.mathType;
#endif #endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else { } else {
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
...@@ -189,49 +234,57 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -189,49 +234,57 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} /* choose_algo */ } /* choose_algo */
if (dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set. // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time)) if (!(reuse_algo || use_cached || params->choose_time))
{ {
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
if (err != CUDNN_STATUS_SUCCESS) { // Fallback to none algo if not supported
#ifdef DEBUG #ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1; return 1;
fprintf(stderr, "(%s error getting worksize:%s, falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0", }
algorithm_name, cudnnGetErrorString(err)); fprintf(stderr, "(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)\n",
algorithm_name);
#endif #endif
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
}
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
} }
// save for next time/cache if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
prev_algo.wsSize = worksize; // algo may have changed due to fallback, we must update it.
prev_algo.algo = algo; prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache // Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo); dnn_conv_update_cache(hashkey, prev_algo);
} }
#ifdef DEBUG #ifdef DEBUG
if (params->choose_algo) { if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1; return 1;
}
// NB: This is printed only when algorithm is chosen at runtime. // NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n", fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name, algorithm_name,
...@@ -274,9 +327,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -274,9 +327,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < groups; g++) for ( int g = 0; g < groups; g++) {
{
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
params->handle, params->handle,
alpha_p, alpha_p,
...@@ -297,7 +348,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -297,7 +348,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradweight operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
...@@ -399,7 +399,7 @@ class DnnBase(COp): ...@@ -399,7 +399,7 @@ class DnnBase(COp):
return [] return []
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version(), 1) return (super(DnnBase, self).c_code_cache_version(), version(), 4)
class GpuDnnConvDesc(COp): class GpuDnnConvDesc(COp):
......
...@@ -2287,6 +2287,152 @@ def dconvgi(border_mode, subsample, filter_dilation, num_groups): ...@@ -2287,6 +2287,152 @@ def dconvgi(border_mode, subsample, filter_dilation, num_groups):
return dconvi return dconvi
class TestDnnConv2DRuntimeAlgorithms(object):
    """Check cuDNN convolutions (fwd, gradinput, gradweight) against the CPU
    reference implementation for every runtime algorithm selector
    ('time_once', 'guess_once', 'time_on_shape_change',
    'guess_on_shape_change'), running a sequence of input shapes so that the
    runtime algorithm cache gets exercised (repeated shapes should hit it).
    """
    ndim = 2
    cpu_conv_class = theano.tensor.nnet.corr.CorrMM
    # Each entry: (number of repetitions, [inputs shape, filters shape]).
    runtime_shapes = [
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50), (5, 2, 25, 31)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),  # cache should be used
        (1, [(1, 2, 3, 4), (6, 2, 2, 1)])
    ]

    def __init__(self):
        self.runtime_algorithms = ('time_once', 'guess_once', 'time_on_shape_change', 'guess_on_shape_change')

    def _flipped(self, filters):
        # The CPU reference class computes a correlation, so filters must be
        # flipped on every spatial dimension to get a convolution.
        if self.ndim == 3:
            return filters[:, :, ::-1, ::-1, ::-1]
        return filters[:, :, ::-1, ::-1]

    def _compare_to_reference(self, algo, f, f_ref, dtype):
        # Run the GPU function and the CPU reference on all runtime shapes
        # and compare their outputs. For the *_once algorithms only the first
        # shape is used (repeated), since the algorithm is chosen once and
        # not re-selected on shape change.
        runtime_shapes = self.runtime_shapes
        if algo in ('time_once', 'guess_once'):
            runtime_shapes = [list(runtime_shapes[0])]
            runtime_shapes[0][0] = 5
        for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
            for i in range(ntimes):
                inputs_val = np.random.random(inputs_shape).astype(dtype)
                filters_val = np.random.random(filters_shape).astype(dtype)
                gpu_res = f(inputs_val, filters_val)
                cpu_res = f_ref(inputs_val, filters_val)
                utt.assert_allclose(cpu_res, gpu_res)

    def test_fwd_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_fwd_runtime_algorithm(algo):
            inputs = theano.tensor.TensorType(dtype, _broadcastable)()
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            # Keep values small to limit accumulated floating-point error.
            inputs /= 10
            filters /= 10
            conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            f = theano.function([inputs, filters], conv, mode=mode_with_gpu)
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), self._flipped(filters))
            f_ref = theano.function([inputs, filters], conv_ref, mode='FAST_RUN')
            self._compare_to_reference(algo, f, f_ref, dtype)

        for algo in self.runtime_algorithms:
            yield (run_fwd_runtime_algorithm, algo)

    def test_gradinput_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradinput_runtime_algorithm(algo):
            # Save and restore the config flag instead of leaking the
            # modified value into subsequent tests.
            old_algo_bwd_data = theano.config.dnn.conv.algo_bwd_data
            theano.config.dnn.conv.algo_bwd_data = algo
            try:
                inputs = theano.tensor.TensorType(dtype, _broadcastable)()
                filters = theano.tensor.TensorType(dtype, _broadcastable)()
                inputs /= 10
                filters /= 10
                conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
                                    subsample=unit_shape, dilation=unit_shape)
                grad_i = theano.tensor.grad(conv.sum(), [inputs])
                f = theano.function([inputs, filters], grad_i, mode=mode_with_gpu)
                # The compiled graph must contain exactly one gradinput op
                # and no other cuDNN convolution ops.
                assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradI)])
                assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
                assert not any(isinstance(node.op, dnn.GpuDnnConvGradW) for node in f.maker.fgraph.apply_nodes)
                conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), self._flipped(filters))
                grad_i_ref = theano.tensor.grad(conv_ref.sum(), [inputs])
                f_ref = theano.function([inputs, filters], grad_i_ref, mode='FAST_RUN')
                self._compare_to_reference(algo, f, f_ref, dtype)
            finally:
                theano.config.dnn.conv.algo_bwd_data = old_algo_bwd_data

        for algo in self.runtime_algorithms:
            yield (run_gradinput_runtime_algorithm, algo)

    def test_gradweight_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradweight_runtime_algorithm(algo):
            # Save and restore the config flag instead of leaking the
            # modified value into subsequent tests.
            old_algo_bwd_filter = theano.config.dnn.conv.algo_bwd_filter
            theano.config.dnn.conv.algo_bwd_filter = algo
            try:
                inputs = theano.tensor.TensorType(dtype, _broadcastable)()
                filters = theano.tensor.TensorType(dtype, _broadcastable)()
                inputs /= 10
                filters /= 10
                conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
                                    subsample=unit_shape, dilation=unit_shape)
                grad_w = theano.tensor.grad(conv.sum(), [filters])
                f = theano.function([inputs, filters], grad_w, mode=mode_with_gpu)
                # The compiled graph must contain exactly one gradweight op
                # and no other cuDNN convolution ops.
                assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradW)])
                assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
                assert not any(isinstance(node.op, dnn.GpuDnnConvGradI) for node in f.maker.fgraph.apply_nodes)
                conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), self._flipped(filters))
                grad_w_ref = theano.tensor.grad(conv_ref.sum(), [filters])
                f_ref = theano.function([inputs, filters], grad_w_ref, mode='FAST_RUN')
                self._compare_to_reference(algo, f, f_ref, dtype)
            finally:
                theano.config.dnn.conv.algo_bwd_filter = old_algo_bwd_filter

        for algo in self.runtime_algorithms:
            yield (run_gradweight_runtime_algorithm, algo)
class TestDnnConv3DRuntimeAlgorithms(TestDnnConv2DRuntimeAlgorithms):
    # Same runtime-algorithm checks as the 2D case, run on 3D convolutions.
    ndim = 3
    # Each entry: (number of repetitions, [inputs shape, filters shape]).
    runtime_shapes = [
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50, 5), (5, 2, 25, 31, 4)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),  # cache should be used
        (1, [(1, 2, 3, 4, 5), (6, 2, 3, 2, 1)])
    ]
class Cudnn_grouped_conv(Grouped_conv_noOptim): class Cudnn_grouped_conv(Grouped_conv_noOptim):
mode = mode_with_gpu mode = mode_with_gpu
conv = staticmethod(dconvfwd) conv = staticmethod(dconvfwd)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论