Commit 0ef5d60b authored by Arnaud Bergeron

Three changes:

- Split the C code apart in between the Conv ops (the previous solution is not workable with the new interface). - Move the C code out into separate files using COp. - Make cudnn_helper.h provide wrappers exposing the R2 api with the R1 functions when possible to reduce #ifdef maze in actual function code.
Parent efe63023
...@@ -3,7 +3,12 @@ ...@@ -3,7 +3,12 @@
#include <cudnn.h> #include <cudnn.h>
inline const char *cudnnGetErrorString(cudnnStatus_t err) { #ifndef CUDNN_VERSION
// Here we define the R2 API in terms of functions in the R1 interface
// This is only for what we use
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
switch (err) { switch (err) {
case CUDNN_STATUS_SUCCESS: case CUDNN_STATUS_SUCCESS:
return "The operation completed successfully."; return "The operation completed successfully.";
...@@ -28,22 +33,110 @@ inline const char *cudnnGetErrorString(cudnnStatus_t err) { ...@@ -28,22 +33,110 @@ inline const char *cudnnGetErrorString(cudnnStatus_t err) {
} }
} }
static inline const int cudnnVersionMacro(){ // some macros to help support cudnn R1 while using R2 code.
#ifdef CUDNN_VERSION
return CUDNN_VERSION;
#else
//CUDNN_VERSION undefined, you probably use cuDNN R1 version
return -1;
#endif
}
// Some macros to help support cuDNN R1 while using R2-style code.
#ifndef CUDNN_VERSION
#define cudnnTensorDescriptor_t cudnnTensor4dDescriptor_t
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor #define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor #define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor #define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensorDescriptor_t cudnnTensor4dDescriptor_t;
/* R2-style output-dimension query implemented with the R1 API.
 * Under R1 the output shape is derived from the convolution descriptor
 * alone, so inputTensorDesc and filterDesc are unused here.
 * Fix: the function was misspelled `cdnnGetConvolution2dForwardOutputDim`,
 * so callers of the correctly-spelled R2 name never found this shim. */
static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim(
    const cudnnConvolutionDescriptor_t convDesc,
    const cudnnTensorDescriptor_t inputTensorDesc,
    const cudnnFilterDescriptor_t filterDesc,
    int *n,
    int *c,
    int *h,
    int *w) {
  (void)inputTensorDesc;  /* unused in the R1 implementation */
  (void)filterDesc;       /* unused in the R1 implementation */
  return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
                                   n, c, h, w);
}
/* R1 has no notion of selectable forward algorithms; model the R2 enum
   as a plain int. */
typedef int cudnnConvolutionFwdAlgo_t;

/* R1 stub for the R2 algorithm-selection call.  R1 has a single built-in
   algorithm, so every argument except `algo` is ignored and algorithm 0
   is always reported.  Always returns CUDNN_STATUS_SUCCESS. */
static inline cudnnStatus_t
cudnnGetConvolutionForwardAlgorithm(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t destDesc,
cudnnConvolutionFwdPreference_t preference,
size_t memoryLimitInbytes,
cudnnConvolutionFwdAlgo_t *algo) {
*algo = 0;
return CUDNN_STATUS_SUCCESS;
}
/* R2-style cudnnConvolutionForward implemented with the R1 entry point.
 * R1 cannot scale the result, so only alpha == 1 and beta == 0 are
 * supported; the workspace arguments are ignored (R1 takes none).
 * Fix: the third parameter was declared `srcDest` while the body used
 * `srcDesc`, which did not compile. */
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
    cudnnHandle_t handle,
    const void *alpha,
    const cudnnTensorDescriptor_t srcDesc,
    const void *srcData,
    const cudnnFilterDescriptor_t filterDesc,
    const void *filterData,
    const cudnnConvolutionDescriptor_t convDesc,
    cudnnConvolutionFwdAlgo_t algo,
    void *workSpace,
    size_t workSpaceSizeInBytes,
    const void *beta,
    const cudnnTensorDescriptor_t destDesc,
    void *destData) {
  /* NOTE(review): alpha/beta are dereferenced as float — assumes float32
     data; confirm no float64 path reaches this wrapper. */
  assert(*(float *)alpha == 1.0);
  assert(*(float *)beta == 0.0);
  return cudnnConvolutionForward(handle, srcDesc, srcData,
                                 filterDesc, filterData,
                                 convDesc, destDesc, destData,
                                 CUDNN_RESULT_NO_ACCUMULATE);
}
/* From here on, unqualified calls resolve to the wrapper above. */
#define cudnnConvolutionForward cudnnConvolutionForward_v2
/* R2-style cudnnConvolutionBackwardFilter implemented with the R1
   entry point.  R1 cannot scale the result, so only alpha == 1 and
   beta == 0 are supported. */
static inline cudnnStatus_t
cudnnConvolutionBackwardFilter_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
/* NOTE(review): alpha/beta are dereferenced as float — assumes float32
   data; confirm no float64 path reaches this wrapper. */
assert(*(float *)alpha == 1.0);
assert(*(float *)beta == 0.0);
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
CUDNN_RESULT_NO_ACCUMULATE);
}
/* From here on, unqualified calls resolve to the wrapper above. */
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
/* R2-style cudnnConvolutionBackwardData implemented with the R1 entry
 * point.  Only alpha == 1 and beta == 0 are supported.
 * Fixes: (1) the body called cudnnConvolutionBackwardFilter instead of
 * cudnnConvolutionBackwardData — and after the #define above that name no
 * longer even refers to the R1 function; (2) filterDesc was typed as a
 * tensor descriptor and gradDesc as a filter descriptor, the reverse of
 * what backward-data takes (the gradient w.r.t. data is a tensor). */
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
    cudnnHandle_t handle,
    const void *alpha,
    const cudnnFilterDescriptor_t filterDesc,
    const void *filterData,
    const cudnnTensorDescriptor_t diffDesc,
    const void *diffData,
    const cudnnConvolutionDescriptor_t convDesc,
    const void *beta,
    const cudnnTensorDescriptor_t gradDesc,
    void *gradData) {
  /* NOTE(review): alpha/beta are dereferenced as float — assumes float32
     data; confirm no float64 path reaches this wrapper. */
  assert(*(float *)alpha == 1.0);
  assert(*(float *)beta == 0.0);
  return cudnnConvolutionBackwardData(handle, filterDesc, filterData,
                                      diffDesc, diffData,
                                      convDesc, gradDesc, gradData,
                                      CUDNN_RESULT_NO_ACCUMULATE);
}
/* From here on, unqualified calls resolve to the wrapper above. */
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
#endif #endif
#endif #endif
...@@ -2,8 +2,9 @@ import os ...@@ -2,8 +2,9 @@ import os
import theano import theano
from theano import Apply, gof, tensor from theano import Apply, gof, tensor
from theano.scalar import as_scalar
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
from theano.gof import Optimizer, local_optimizer from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compat import PY3 from theano.compat import PY3
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
...@@ -102,7 +103,7 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) { ...@@ -102,7 +103,7 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
""" % dict(var=var, err=err, desc=desc, fail=fail) """ % dict(var=var, err=err, desc=desc, fail=fail)
class DnnBase(GpuOp): class DnnBase(GpuOp, COp):
""" """
Creates a handle for cudnn and pulls in the cudnn libraries and headers. Creates a handle for cudnn and pulls in the cudnn libraries and headers.
""" """
...@@ -110,6 +111,9 @@ class DnnBase(GpuOp): ...@@ -110,6 +111,9 @@ class DnnBase(GpuOp):
# the input broadcasting pattern. # the input broadcasting pattern.
check_broadcast = False check_broadcast = False
def __init__(self):
COp.__init__(self, "dnn_base.c")
def c_headers(self): def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h'] return ['cudnn.h', 'cudnn_helper.h']
...@@ -119,11 +123,6 @@ class DnnBase(GpuOp): ...@@ -119,11 +123,6 @@ class DnnBase(GpuOp):
def c_libraries(self): def c_libraries(self):
return ['cudnn'] return ['cudnn']
def c_support_code(self):
return """
cudnnHandle_t _handle = NULL;
"""
def c_init_code(self): def c_init_code(self):
if PY3: if PY3:
error_out = "NULL" error_out = "NULL"
...@@ -139,20 +138,26 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -139,20 +138,26 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
}""" % (error_out,)] }""" % (error_out,)]
class DnnVersion(DnnBase): class DnnVersion(GpuOp):
def c_compiler(self): def c_compiler(self):
return NVCC_compiler return NVCC_compiler
def c_headers(self):
return ['cudnn.h']
def c_libraries(self):
return ['cudnn']
def make_node(self): def make_node(self):
return Apply(self, [], [Generic()()]) return Apply(self, [], [Generic()()])
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
o = outputs[0] o = outputs[0]
return """ return """
#if CUDNN_VERSION >= 20 #if defined(CUDNN_VERSION)
%(o)s = PyTuple_Pack(2, PyInt_FromLong(cudnnVersionMacro()), PyInt_FromLong(cudnnGetVersion())); %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else #else
%(o)s = PyInt_FromLong(cudnnVersionMacro()); %(o)s = PyInt_FromLong(-1);
#endif #endif
""" % locals() """ % locals()
...@@ -274,7 +279,7 @@ class GpuDnnConvDesc(GpuOp): ...@@ -274,7 +279,7 @@ class GpuDnnConvDesc(GpuOp):
PyErr_SetString(PyExc_ValueError, "bad border mode"); PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s %(fail)s
} }
#if CUDNN_VERSION >= 20 #if defined(CUDNN_VERSION) && CUDNN_VERSION >= 20
err = cudnnSetConvolution2dDescriptor( err = cudnnSetConvolution2dDescriptor(
%(desc)s, %(desc)s,
pad_h%(name)s, pad_h%(name)s,
...@@ -313,223 +318,7 @@ class GpuDnnConvDesc(GpuOp): ...@@ -313,223 +318,7 @@ class GpuDnnConvDesc(GpuOp):
return (2, version()) return (2, version())
class GpuDnnConvBase(DnnBase): class GpuDnnConv(DnnBase, COp):
__props__ = ()
def c_support_code_struct(self, node, name):
return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t output%(name)s;
cudnnFilterDescriptor_t kerns%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
output%(name)s = NULL;
kerns%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateFilterDescriptor(&kerns%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
return """
if (input%(name)s != NULL) {cudnnDestroyTensorDescriptor(input%(name)s);}
if (output%(name)s != NULL) {cudnnDestroyTensorDescriptor(output%(name)s);}
if (kerns%(name)s != NULL) {cudnnDestroyFilterDescriptor(kerns%(name)s);}
""" % dict(name=name)
def c_set_filter(self, var, desc, err, fail):
return """
%(err)s = cudnnSetFilter4dDescriptor(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3]
);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"%(cls)s: could not set filter descriptor: %%s."
" dims= %%d %%d %%d %%d",
cudnnGetErrorString(%(err)s),
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3]);
%(fail)s
}
""" % dict(var=var, desc=desc, err=err, fail=fail,
cls=self.__class__.__name__)
def c_set_tensor4d(self, *arg):
return c_set_tensor4d(*arg)
def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[2]
if len(inputs) <= 3:
height, width = (-1, -1)
else:
height, width = inputs[3:]
height = '(*(npy_%s*)(PyArray_DATA(%s)))' % (
node.inputs[3].dtype, height)
width = '(*(npy_%s*)(PyArray_DATA(%s)))' % (
node.inputs[4].dtype, width)
out, = outputs
checks = []
for v in inputs[:2]:
checks.append("""
if (!CudaNdarray_is_c_contiguous(%s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%s
}
""" % (v, sub['fail']))
sets = []
for p, v, d in zip(inputs[:2], self.conv_inputs, self.conv_types[:2]):
sets.append(getattr(self, 'c_set_'+d)(p, v + name,
'err' + name, sub['fail']))
set_out = getattr(self, 'c_set_' + self.conv_types[2])(
out, self.conv_output + name, 'err' + name,
sub['fail'])
return """
cudnnStatus_t err%(name)s = CUDNN_STATUS_SUCCESS;
%(checks)s
%(sets)s
{
int out_dims[4];
#ifndef CUDNN_VERSION
err%(name)s = cudnnGetOutputTensor4dDim(
%(desc)s, %(path)s,
&out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not get output sizes: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
// workaround for cudnn R1 bug
if (%(path)s == CUDNN_CONVOLUTION_WEIGHT_GRAD &&
(out_dims[0] != CudaNdarray_HOST_DIMS(%(input2)s)[1] ||
out_dims[1] != CudaNdarray_HOST_DIMS(%(input1)s)[1])) {
out_dims[0] = CudaNdarray_HOST_DIMS(%(input2)s)[1];
out_dims[1] = CudaNdarray_HOST_DIMS(%(input1)s)[1];
// This is a horrible hack that is unfortulately necessary
int *dd = (int *)%(desc)s;
out_dims[2] = dd[5];
out_dims[3] = dd[6];
}
#else
if (!%(full)d){
err%(name)s = cudnnGetConvolution2dForwardOutputDim(
%(desc)s,
input%(id)d,
kerns%(id)d,
&out_dims[0], &out_dims[1],&out_dims[2], &out_dims[3]);
}else{
int padH=0, padW=0, dH=1, dW=1, upscalex=1, upscaley=1;
cudnnConvolutionMode_t mode=CUDNN_CONVOLUTION;
err%(name)s = cudnnGetConvolution2dDescriptor(
%(desc)s, &padW, &padH, &dH, &dW,
&upscalex, &upscaley, &mode);
out_dims[0] = CudaNdarray_HOST_DIMS(%(input2)s)[0];
out_dims[1] = CudaNdarray_HOST_DIMS(%(input1)s)[1];
out_dims[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(%(input1)s)[2] - 1) * dH + CudaNdarray_HOST_DIMS(%(input2)s)[2] - 2*padH;
out_dims[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(%(input1)s)[3] - 1) * dW + CudaNdarray_HOST_DIMS(%(input2)s)[3] - 2*padW;
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"%(cls)s, error while computing the output shape: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s
}
}
%(set_out)s
#ifndef CUDNN_VERSION
err%(name)s = %(method)s(
_handle,
%(input1_desc)s, CudaNdarray_DEV_DATA(%(input1)s),
%(input2_desc)s, CudaNdarray_DEV_DATA(%(input2)s),
%(desc)s,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s),
CUDNN_RESULT_NO_ACCUMULATE
);
#else
{
const float alpha = 1;
const float beta = 0;
/*
cudnnGetConvolutionForwardAlgorithm(
_handle,
%(input1_desc)s,
%(input2_desc)s,
%(desc)s,
%(output_desc)s,
CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, //TODO, config of this
0, //TODO, memoryLimitInbytes,
cudnnConvolutionFwdAlgo_t
);
*/
err%(name)s = %(method)s(
_handle,
(void*)&alpha,
%(input1_desc)s, CudaNdarray_DEV_DATA(%(input1)s),
%(input2_desc)s, CudaNdarray_DEV_DATA(%(input2)s),
%(desc)s,
%(algo)s
(void*)&beta,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "%(cls)s, error doing operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(out=out, desc=desc, fail=sub['fail'],
name=name, checks='\n'.join(checks), sets='\n'.join(sets),
set_out=set_out, input1=inputs[0], input2=inputs[1],
input1_desc=self.conv_inputs[0]+name,
input2_desc=self.conv_inputs[1]+name,
output_desc=self.conv_output+name,
height=height, width=width,
cls=self.__class__.__name__,
full=int("GpuDnnConvGradI" == self.__class__.__name__),
method=self.conv_op, path=self.path_flag, algo=self.algo)
def c_code_cache_version(self):
return (10, version())
class GpuDnnConv(GpuDnnConvBase):
""" """
The forward convolution. The forward convolution.
...@@ -538,14 +327,11 @@ class GpuDnnConv(GpuDnnConvBase): ...@@ -538,14 +327,11 @@ class GpuDnnConv(GpuDnnConvBase):
:param descr: the convolution descriptor :param descr: the convolution descriptor
""" """
conv_inputs = 'input', 'kerns' __props__ = ()
conv_output = 'output'
conv_types = 'tensor4d', 'filter', 'tensor4d' def __init__(self):
conv_op = 'cudnnConvolutionForward' COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
path_flag = 'CUDNN_CONVOLUTION_FWD' "APPLY_SPECIFIC(conv_fwd)")
algo = """CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, //TODO: algo,
NULL,//TODO, void *workspace,
0, //TODO: workspacesize"""
def make_node(self, img, kern, desc): def make_node(self, img, kern, desc):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
...@@ -571,8 +357,10 @@ class GpuDnnConv(GpuDnnConvBase): ...@@ -571,8 +357,10 @@ class GpuDnnConv(GpuDnnConvBase):
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, desc, img.shape[-2:]) d_img = GpuDnnConvGradI()(kerns, top, desc,
d_kerns = GpuDnnConvGradW()(img, top, desc) img.shape[2], img.shape[3])
d_kerns = GpuDnnConvGradW()(img, top, desc,
kerns.shape[2], kerns.shape[3])
return d_img, d_kerns, theano.gradient.DisconnectedType()() return d_img, d_kerns, theano.gradient.DisconnectedType()()
...@@ -581,7 +369,7 @@ class GpuDnnConv(GpuDnnConvBase): ...@@ -581,7 +369,7 @@ class GpuDnnConv(GpuDnnConvBase):
return [[1], [1], [0]] return [[1], [1], [0]]
class GpuDnnConvGradW(GpuDnnConvBase): class GpuDnnConvGradW(DnnBase, COp):
""" """
The convolution gradient with respect to the weights. The convolution gradient with respect to the weights.
...@@ -590,30 +378,30 @@ class GpuDnnConvGradW(GpuDnnConvBase): ...@@ -590,30 +378,30 @@ class GpuDnnConvGradW(GpuDnnConvBase):
:param descr: the convolution descriptor :param descr: the convolution descriptor
""" """
__props__ = ()
conv_inputs = 'input', 'output', def __init__(self):
conv_output = 'kerns' COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
conv_types = 'tensor4d', 'tensor4d', 'filter' "APPLY_SPECIFIC(conv_gw)")
path_flag = 'CUDNN_CONVOLUTION_WEIGHT_GRAD'
conv_op = 'cudnnConvolutionBackwardFilter'
algo = ""
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, desc = inp img, top, desc, h, w = inp
kerns, = grads kerns, = grads
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, desc) d_img = GpuDnnConvGradI()(kerns, top, desc,
img.shape[2], img.shape[3])
d_top = GpuDnnConv()(img, kerns, desc) d_top = GpuDnnConv()(img, kerns, desc)
return d_img, d_top, theano.gradient.DisconnectedType()() return (d_img, d_top, DisconnectedType()(), DisconnectedType()(),
DisconnectedType()())
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc, h, w
return [[1], [1], [0]] return [[1], [1], [0], [0], [0]]
def make_node(self, img, topgrad, desc): def make_node(self, img, topgrad, desc, h, w):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
if img.type.ndim != 4: if img.type.ndim != 4:
...@@ -625,14 +413,18 @@ class GpuDnnConvGradW(GpuDnnConvBase): ...@@ -625,14 +413,18 @@ class GpuDnnConvGradW(GpuDnnConvBase):
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
h = as_scalar(h)
w = as_scalar(w)
broadcastable = [topgrad.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[1],
img.type.broadcastable[1], img.type.broadcastable[1],
False, False] False, False]
return Apply(self, [img, topgrad, desc],
return Apply(self, [img, topgrad, desc, h, w],
[CudaNdarrayType(broadcastable)()]) [CudaNdarrayType(broadcastable)()])
class GpuDnnConvGradI(GpuDnnConvBase): class GpuDnnConvGradI(DnnBase, COp):
""" """
The convolution gradient with respect to the inputs. The convolution gradient with respect to the inputs.
...@@ -641,30 +433,29 @@ class GpuDnnConvGradI(GpuDnnConvBase): ...@@ -641,30 +433,29 @@ class GpuDnnConvGradI(GpuDnnConvBase):
:param descr: the convolution descriptor :param descr: the convolution descriptor
""" """
__props__ = ()
conv_inputs = 'kerns', 'output', def __init__(self):
conv_output = 'input' COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
conv_types = 'filter', 'tensor4d', 'tensor4d' "APPLY_SPECIFIC(conv_gi)")
path_flag = 'CUDNN_CONVOLUTION_DATA_GRAD'
conv_op = 'cudnnConvolutionBackwardData'
algo = ""
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, desc = inp kerns, top, desc, h, w = inp
img, = grads img, = grads
img = gpu_contiguous(img) img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, desc) d_kerns = GpuDnnConvGradW()(img, top, desc,
kerns.shape[2], kerns.shape[3])
d_top = GpuDnnConv()(img, kerns, desc) d_top = GpuDnnConv()(img, kerns, desc)
d_height_width = (DisconnectedType()(),) * 2 if len(inp) == 5 else () return (d_kerns, d_top, DisconnectedType()(), DisconnectedType()(),
return (d_kerns, d_top, DisconnectedType()()) + d_height_width DisconnectedType()())
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc, h, w
return [[1], [1], [0]] return [[1], [1], [0], [0], [0]]
def make_node(self, kern, topgrad, desc, shape=None): def make_node(self, kern, topgrad, desc, h, w):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
if kern.type.ndim != 4: if kern.type.ndim != 4:
...@@ -675,18 +466,15 @@ class GpuDnnConvGradI(GpuDnnConvBase): ...@@ -675,18 +466,15 @@ class GpuDnnConvGradI(GpuDnnConvBase):
if not isinstance(desc.type, CDataType) \ if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
if shape is None:
if not (desc.owner and h = as_scalar(h)
isinstance(desc.owner.op, GpuDnnConvDesc) and w = as_scalar(w)
desc.owner.op.subsample == (1, 1)):
raise ValueError('shape must be given if subsample != (1, 1)')
height_width = []
else:
height_width = [shape[0], shape[1]]
broadcastable = [topgrad.type.broadcastable[0], broadcastable = [topgrad.type.broadcastable[0],
kern.type.broadcastable[1], kern.type.broadcastable[1],
False, False] False, False]
return Apply(self, [kern, topgrad, desc] + height_width,
return Apply(self, [kern, topgrad, desc, h, w],
[CudaNdarrayType(broadcastable)()]) [CudaNdarrayType(broadcastable)()])
...@@ -733,7 +521,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -733,7 +521,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img.shape[3] - kerns.shape[3] + 1) img.shape[3] - kerns.shape[3] + 1)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, shape) conv_mode='cross')(img.shape, shape)
conv = GpuDnnConvGradW()(img, kerns, desc) conv = GpuDnnConvGradW()(img, kerns, desc, shape[2], shape[3])
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3)) return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
elif (border_mode == 'full' and subsample == (1, 1) and elif (border_mode == 'full' and subsample == (1, 1) and
...@@ -749,7 +537,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -749,7 +537,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img.shape[3] + kerns.shape[3] - 1) img.shape[3] + kerns.shape[3] - 1)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(shape, kerns.shape) conv_mode=conv_mode)(shape, kerns.shape)
return GpuDnnConvGradI()(kerns, img, desc) return GpuDnnConvGradI()(kerns, img, desc, shape[2], shape[3])
# Standard case: We use GpuDnnConv with suitable padding. # Standard case: We use GpuDnnConv with suitable padding.
img = gpu_contiguous(img) img = gpu_contiguous(img)
...@@ -1302,8 +1090,7 @@ if (%(algo)d == 1) ...@@ -1302,8 +1090,7 @@ if (%(algo)d == 1)
cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL; cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1) if (%(mode)d == 1)
mode%(name)s = CUDNN_SOFTMAX_MODE_INSTANCE; mode%(name)s = CUDNN_SOFTMAX_MODE_INSTANCE;
""" % dict(name=name, """ % dict(name=name, tensor_format=tensor_format, mode=mode, algo=algo)
tensor_format=tensor_format, mode=mode, algo=algo)
# Validate the input and build the input variables. # Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs): for input_idx, input_name in enumerate(self.softmax_inputs):
...@@ -1366,13 +1153,13 @@ const float alpha = 1.; ...@@ -1366,13 +1153,13 @@ const float alpha = 1.;
const float beta = 0.; const float beta = 0.;
err%(name)s = cudnnSoftmaxForward( err%(name)s = cudnnSoftmaxForward(
_handle, _handle,
algo%(id)d, algo%(name)s,
mode%(id)d, mode%(name)s,
(void*) &alpha, (void*) &alpha,
softmax_input_%(id)d, softmax_input_%(name)s,
CudaNdarray_DEV_DATA(%(ins)s), CudaNdarray_DEV_DATA(%(ins)s),
(void*) &beta, (void*) &beta,
softmax_output_%(id)d, softmax_output_%(name)s,
CudaNdarray_DEV_DATA(%(outs)s) CudaNdarray_DEV_DATA(%(outs)s)
); );
} }
...@@ -1420,15 +1207,15 @@ const float alpha = 1.; ...@@ -1420,15 +1207,15 @@ const float alpha = 1.;
const float beta = 0.; const float beta = 0.;
err%(name)s = cudnnSoftmaxBackward( err%(name)s = cudnnSoftmaxBackward(
_handle, _handle,
algo%(id)d, algo%(name)s,
mode%(id)d, mode%(name)s,
(void*) &alpha, (void*) &alpha,
%(name1)s_%(id)d, %(name1)s_%(name)s,
CudaNdarray_DEV_DATA(%(ins1)s), CudaNdarray_DEV_DATA(%(ins1)s),
%(name0)s_%(id)d, %(name0)s_%(name)s,
CudaNdarray_DEV_DATA(%(ins0)s), CudaNdarray_DEV_DATA(%(ins0)s),
(void*) &beta, (void*) &beta,
softmax_output_%(id)d, softmax_output_%(name)s,
CudaNdarray_DEV_DATA(%(outs)s) CudaNdarray_DEV_DATA(%(outs)s)
); );
} }
......
#section support_code

/* Module-wide cuDNN context.  Created once in the #section init_code
   block of this file and shared by every op instance that includes it. */
static cudnnHandle_t _handle = NULL;
/* Fill `desc` with the shape and strides of `var` (float32, 4d).
 * cuDNN rejects 0 strides (produced by broadcastable dimensions), so any
 * zero stride is replaced by the stride a C-contiguous array of the same
 * shape would have.  Returns 0 on success, -1 with a Python exception set.
 * (Also hoists the eight duplicated stride ternaries into locals and adds
 * the missing separator in the error message.) */
static int
c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
  const int *dims = CudaNdarray_HOST_DIMS(var);
  const int *str = CudaNdarray_HOST_STRIDES(var);
  /* Substitute C-contiguous strides for broadcast (0) strides. */
  int str0 = str[0] ? str[0] : dims[2] * dims[3] * dims[1];
  int str1 = str[1] ? str[1] : dims[2] * dims[3];
  int str2 = str[2] ? str[2] : dims[3];
  int str3 = str[3] ? str[3] : 1;
  cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
    desc, CUDNN_DATA_FLOAT,
    dims[0], dims[1], dims[2], dims[3],
    str0, str1, str2, str3
  );
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensor4d descriptor: %s. "
                 "shapes=%d %d %d %d strides=%d %d %d %d",
                 cudnnGetErrorString(err),
                 dims[0], dims[1], dims[2], dims[3],
                 str0, str1, str2, str3
    );
    return -1;
  }
  return 0;
}
/* Fill `desc` with the shape of the filter array `var` (float32).
   Filters must be C-contiguous.  Returns 0 on success, -1 with a
   Python exception set. */
static int
c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
  const int *dims = CudaNdarray_HOST_DIMS(var);
  cudnnStatus_t status;

  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  status = cudnnSetFilter4dDescriptor(desc, CUDNN_DATA_FLOAT,
                                      dims[0], dims[1],
                                      dims[2], dims[3]);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s."
                 " dims= %d %d %d %d",
                 cudnnGetErrorString(status),
                 dims[0], dims[1], dims[2], dims[3]);
    return -1;
  }
  return 0;
}
#section init_code

/* Create the module-wide cuDNN handle once at module initialization.
 * Fix: the guard used PYTHON_MAJOR_VERSION, which is not defined by the
 * Python C API (the macro is PY_MAJOR_VERSION, from patchlevel.h), so the
 * Python 3 branch was dead and `return;` was emitted from the
 * PyObject*-returning Py3 module-init function. */
{
  cudnnStatus_t err;
  if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                 cudnnGetErrorString(err));
#if PY_MAJOR_VERSION >= 3
    return NULL;
#else
    return;
#endif
  }
}
#section support_code_struct

/* Per-apply cuDNN descriptors: created in init_code_struct, configured
   on every call by c_set_tensor4d/c_set_filter, destroyed in
   cleanup_code_struct. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
#section init_code_struct

/* Allocate the three descriptors used by the conv ops.  Pointers are
   NULLed first so cleanup_code_struct can tell what was allocated if a
   later allocation fails (FAIL aborts this init block). */
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct

/* Destroy whichever descriptors init_code_struct managed to allocate. */
if (APPLY_SPECIFIC(input) != NULL) {
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
}
if (APPLY_SPECIFIC(output) != NULL) {
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
}
if (APPLY_SPECIFIC(kerns) != NULL) {
  cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
}
#section support_code_struct

/* Forward convolution: *output = conv(input, kerns) as described by
   `desc`.  Allocates/reuses *output with the shape cuDNN reports.
   Returns 0 on success, 1 on error with a Python exception set. */
int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
cudnnConvolutionDescriptor_t desc,
CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

/* Describe the input tensor and the filters to cuDNN. */
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;

{
int out_dims[4];
/* Ask cuDNN for the output shape, then prepare the output array. */
err = cudnnGetConvolution2dForwardOutputDim(
desc,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
&out_dims[0], &out_dims[1], &out_dims[2], &out_dims[3]);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error while computing the output shape: %s",
cudnnGetErrorString(err));
return 1;
}
if (CudaNdarray_prep_output(output, 4, out_dims) != 0) {
return 1;
}
}

if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
return 1;

{
cudnnConvolutionFwdAlgo_t algo;
/* Let cuDNN pick an algorithm that needs no workspace (under R1 the
   shim in cudnn_helper.h always reports algorithm 0). */
err = cudnnGetConvolutionForwardAlgorithm(
_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, // TODO: add op param
0,
&algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: Couldn't select convolution algorithm: %s",
cudnnGetErrorString(err));
return 1;
}
/* alpha/beta of 1/0: plain convolution, no accumulation into output. */
const float alpha = 1;
const float beta = 0;
err = cudnnConvolutionForward(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
desc,
algo,
NULL, 0,
(void *)&beta,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct

/* Gradient of the convolution with respect to its inputs.
   kerns:  the filters
   output: gradient of the convolution output (dy)
   h, w:   spatial size of the input gradient to produce
   input:  receives the computed input gradient
   Returns 0 on success, 1 on error with a Python exception set. */
int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
                        cudnnConvolutionDescriptor_t desc,
                        int h, int w,
                        CudaNdarray **input) {
  cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
  const float alpha = 1;
  const float beta = 0;
  int dims[4];

  /* Describe dy and the filters to cuDNN. */
  if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  /* Input-gradient shape: batch from dy, channels from the filters,
     spatial extent supplied by the caller. */
  dims[0] = CudaNdarray_HOST_DIMS(output)[0];
  dims[1] = CudaNdarray_HOST_DIMS(kerns)[1];
  dims[2] = h;
  dims[3] = w;
  if (CudaNdarray_prep_output(input, 4, dims) != 0)
    return 1;

  if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;

  /* alpha/beta of 1/0: plain gradient, no accumulation. */
  status = cudnnConvolutionBackwardData(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      (void *)&beta,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
                 cudnnGetErrorString(status));
    return 1;
  }
  return 0;
}
#section support_code_struct

/* Gradient of the convolution with respect to the filters.
   input:  the forward input
   output: gradient of the convolution output (dy)
   h, w:   spatial size of the filters to produce
   kerns:  receives the computed filter gradient
   Returns 0 on success, 1 on error with a Python exception set. */
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
cudnnConvolutionDescriptor_t desc,
int h, int w,
CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

/* Describe the forward input and dy to cuDNN. */
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;

{
int out_dims[4];
/* Filter-gradient shape: output channels from dy, input channels from
   the forward input, spatial extent supplied by the caller. */
out_dims[0] = CudaNdarray_HOST_DIMS(output)[1];
out_dims[1] = CudaNdarray_HOST_DIMS(input)[1];
out_dims[2] = h;
out_dims[3] = w;
if (CudaNdarray_prep_output(kerns, 4, out_dims) != 0) {
return 1;
}
}

if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;

{
/* alpha/beta of 1/0: plain gradient, no accumulation. */
const float alpha = 1;
const float beta = 0;
err = cudnnConvolutionBackwardFilter(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
(void *)&beta,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
...@@ -27,7 +27,7 @@ from theano.sandbox import cuda ...@@ -27,7 +27,7 @@ from theano.sandbox import cuda
if cuda.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv, GpuDnnConvBase, dnn_conv from theano.sandbox.cuda.dnn import GpuDnnConv, DnnBase, dnn_conv
#needed as the gpu conv don't have a perform implementation. #needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
...@@ -596,7 +596,7 @@ def test_gemm_valid(): ...@@ -596,7 +596,7 @@ def test_gemm_valid():
def test_dnn_valid(): def test_dnn_valid():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
for t in _test_valid(GpuDnnConvBase, mode=theano_mode.including("cudnn")): for t in _test_valid(DnnBase, mode=theano_mode.including("cudnn")):
yield t yield t
...@@ -710,7 +710,7 @@ def test_gemm_full(): ...@@ -710,7 +710,7 @@ def test_gemm_full():
def test_dnn_full(): def test_dnn_full():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
for t in _test_full(GpuDnnConvBase, mode=theano_mode.including("cudnn")): for t in _test_full(DnnBase, mode=theano_mode.including("cudnn")):
yield t yield t
...@@ -762,13 +762,13 @@ def test_gemm_subsample(): ...@@ -762,13 +762,13 @@ def test_gemm_subsample():
def test_dnn_subsample(): def test_dnn_subsample():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
for t in _test_subsample(GpuDnnConvBase, theano_mode.including('cudnn')): for t in _test_subsample(DnnBase, theano_mode.including('cudnn')):
yield t yield t
class TestConv2DGPU(unittest.TestCase): class TestConv2DGPU(unittest.TestCase):
conv_ops = (cuda.blas.GpuConv, conv_ops = (cuda.blas.GpuConv,
cuda.dnn.GpuDnnConvBase, cuda.dnn.DnnBase,
cuda.blas.BaseGpuCorrMM) cuda.blas.BaseGpuCorrMM)
def test_logical_shapes(self): def test_logical_shapes(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论