提交 70b1100c authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Move the gradients of convolution over to v3 and fix the test for gradI

上级 59dcaf9c
...@@ -533,19 +533,27 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -533,19 +533,27 @@ class GpuDnnConvGradW(DnnBase, COp):
""" """
__props__ = ('inplace',) __props__ = ('algo', 'inplace')
def __init__(self, inplace=False): def __init__(self, inplace=False, algo=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
if not hasattr(self, 'inplace'): if not hasattr(self, 'inplace'):
self.inplace = False self.inplace = False
if not hasattr(self, 'algo'):
self.algo = config.dnn.conv.algo_bwd
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp img, top, output, desc, alpha, beta = inp
...@@ -566,24 +574,55 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -566,24 +574,55 @@ class GpuDnnConvGradW(DnnBase, COp):
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
defs = []
if self.inplace: if self.inplace:
return [('CONV_INPLACE', '1')] defs.append(('CONV_INPLACE', '1'))
if version() < 3000:
alg = '0'
else: else:
return [] alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img) img = as_gpuarray_variable(img)
topgrad = as_gpuarray_variable(topgrad) topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output) output = as_gpuarray_variable(output)
if img.type.ndim != 4: if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D or 5D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim not in (4, 5):
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D or 5D tensor')
if output.type.ndim != 4: if output.type.ndim not in (4, 5):
raise TypeError('output must be 4D tensor') raise TypeError('output must be 4D or 5D tensor')
if not isinstance(desc.type, CDataType) \ if (img.type.ndim != topgrad.type.ndim or
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': img.type.ndim != output.type.ndim):
raise TypeError("The number of dimensions of "
"img, topgrad and output must match")
if img.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
raise ValueError("convolution algo %s can't be used for "
                             "3d convolutions" % (self.algo,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
...@@ -609,14 +648,27 @@ class GpuDnnConvGradI(DnnBase): ...@@ -609,14 +648,27 @@ class GpuDnnConvGradI(DnnBase):
""" """
__props__ = ('inplace',) __props__ = ('algo', 'inplace',)
def __init__(self, inplace=False): def __init__(self, inplace=False, algo=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None:
algo = config.dnn.conv.algo_bwd
self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'algo'):
self.algo = config.dnn.conv.algo_bwd
if not hasattr(self, 'inplace'):
self.inplace = False
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp kerns, top, output, desc, alpha, beta = inp
...@@ -637,24 +689,55 @@ class GpuDnnConvGradI(DnnBase): ...@@ -637,24 +689,55 @@ class GpuDnnConvGradI(DnnBase):
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self): def get_op_params(self):
defs = []
if self.inplace: if self.inplace:
return [('CONV_INPLACE', '1')] defs.append(('CONV_INPLACE', '1'))
if version() < 3000:
alg = '0'
else: else:
return [] alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
defs.append(('CONV_ALGO', alg))
return defs
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_gpuarray_variable(kern) kern = as_gpuarray_variable(kern)
topgrad = as_gpuarray_variable(topgrad) topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output) output = as_gpuarray_variable(output)
if kern.type.ndim != 4: if kern.type.ndim not in (4, 5):
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D or 5D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim not in (4, 5):
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D or 5D tensor')
if output.type.ndim != 4: if output.type.ndim not in (4, 5):
raise TypeError('output must be 4D tensor') raise TypeError('output must be 4D or 5D tensor')
if not isinstance(desc.type, CDataType) \ if (kern.type.ndim != topgrad.type.ndim or
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': kern.type.ndim != output.type.ndim):
raise TypeError("The number of dimensions of "
"kern, topgrad and output must match")
if kern.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
raise ValueError("convolution algo %s can't be used for "
                             "3d convolutions" % (self.algo,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype) alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype)
...@@ -1638,7 +1721,7 @@ def local_dnn_convgw_inplace(node): ...@@ -1638,7 +1721,7 @@ def local_dnn_convgw_inplace(node):
isinstance(dest.owner.op, GpuAllocEmpty) and isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1): len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradW(inplace=True)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True) @local_optimizer([GpuDnnConvGradI], inplace=True)
...@@ -1651,7 +1734,7 @@ def local_dnn_convgi_inplace(node): ...@@ -1651,7 +1734,7 @@ def local_dnn_convgi_inplace(node):
isinstance(dest.owner.op, GpuAllocEmpty) and isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1): len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradI(inplace=True)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
...@@ -1674,7 +1757,7 @@ def local_dnn_conv_alpha_merge(node, *inputs): ...@@ -1674,7 +1757,7 @@ def local_dnn_conv_alpha_merge(node, *inputs):
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
...@@ -1682,28 +1765,28 @@ def local_dnn_convw_alpha_merge(node, *inputs): ...@@ -1682,28 +1765,28 @@ def local_dnn_convw_alpha_merge(node, *inputs):
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
......
...@@ -12,7 +12,7 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { ...@@ -12,7 +12,7 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#ifdef CUDNN_VERSION > 3000 #if CUDNN_VERSION > 3000
case GA_HALF: case GA_HALF:
dt = CUDNN_DATA_HALF; dt = CUDNN_DATA_HALF;
break; break;
...@@ -64,7 +64,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -64,7 +64,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#ifdef CUDNN_VERSION > 3000 #if CUDNN_VERSION > 3000
case GA_HALF: case GA_HALF:
dt = CUDNN_DATA_HALF; dt = CUDNN_DATA_HALF;
break; break;
......
...@@ -11,12 +11,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -11,12 +11,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void *beta_p; void *beta_p;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
"GpuDnnConv images and kernel must have the same stack size"); "stack size");
return 1; return 1;
} }
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
...@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -48,19 +49,142 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -48,19 +49,142 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardData( cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_kern_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 0) > 1024 || PyGpuArray_DIM(*input, 1) > 1024 ||
(PyGpuArray_DIM(kerns, 0) == 1 && PyGpuArray_DIM(kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionBackwardData_v3(
_handle, _handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input)); APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (worksize != 0)
c->ops->buffer_release(workspace);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
#section support_code_struct #section support_code_struct
int int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km, PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
...@@ -16,9 +16,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -16,9 +16,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
switch (input->ga.typecode) { switch (input->ga.typecode) {
...@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -27,6 +27,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -51,16 +52,140 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -51,16 +52,140 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardFilter( cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
#ifdef CHOOSE_ALGO
static int reuse_algo = 0;
static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_img_dims[5] = {0};
static size_t prev_top_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
#endif
#endif
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 0) > 1024 || PyGpuArray_DIM(input, 1) > 1024 ||
(PyGpuArray_DIM(*kerns, 0) == 1 && PyGpuArray_DIM(*kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
}
#endif
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionBackwardFilter_v3(
_handle, _handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (worksize != 0)
c->ops->buffer_release(workspace);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
...@@ -467,42 +467,41 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -467,42 +467,41 @@ class TestDnnInferShapes(utt.InferShapeTester):
img = T.ftensor4('img') img = T.ftensor4('img')
kerns = T.ftensor4('kerns') kerns = T.ftensor4('kerns')
out = T.ftensor4('out') out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(3, 4, 5, 6),
dtype='float32'
)
kern_vals = numpy.asarray( kern_vals = numpy.asarray(
numpy.random.rand(13, 14, 15, 16), numpy.random.rand(13, 14, 15, 16),
dtype='float32' dtype='float32'
) )
out_vals = numpy.asarray(
numpy.random.rand(3, 13, 5, 6),
dtype='float32'
)
for params in product( for params in product(
['valid'], # Should this work for 'full'? ['valid'], # Should this work for 'full'?
[(1, 1)], [(1, 1)],
['conv', 'cross'] ['conv', 'cross']
): ):
temp_kerns = kerns.dimshuffle(1, 0, 2, 3)
shape = ( shape = (
img_val.shape[0], kern_vals.shape[1], out_vals.shape[0], kern_vals.shape[1],
img_val.shape[2] + kern_vals.shape[2] - 1, out_vals.shape[2] + kern_vals.shape[2] - 1,
img_val.shape[3] + kern_vals.shape[3] - 1 out_vals.shape[3] + kern_vals.shape[3] - 1
) )
out_vals = numpy.zeros(shape, dtype='float32') img_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc( desc = dnn.GpuDnnConvDesc(
border_mode=params[0], border_mode=params[0],
subsample=params[1], subsample=params[1],
conv_mode=params[2] conv_mode=params[2]
)(out.shape, temp_kerns.shape) )(out.shape, kerns.shape)
conv_grad_i = dnn.GpuDnnConvGradI()( conv_grad_i = dnn.GpuDnnConvGradI()(
temp_kerns, kerns,
img,
out, out,
img,
desc, desc,
) )
self._compile_and_check( self._compile_and_check(
[temp_kerns, img, out], [kerns, img, out],
[conv_grad_i], [conv_grad_i],
[kern_vals, img_val, out_vals], [kern_vals, img_vals, out_vals],
dnn.GpuDnnConvGradI dnn.GpuDnnConvGradI
) )
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论