提交 6117f98b authored 作者: Nicolas Ballas's avatar Nicolas Ballas 提交者: --global

add cudnnv3 conv3d

上级 34223240
差异被折叠。
...@@ -33,6 +33,60 @@ c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) { ...@@ -33,6 +33,60 @@ c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
return 0; return 0;
} }
static int
c_set_tensorNd(CudaNdarray *var, int dim, cudnnTensorDescriptor_t desc) {
  // Configure `desc` as a `dim`-dimensional CUDNN_DATA_FLOAT tensor
  // descriptor matching the shape and strides of `var`.
  //
  // cuDNN requires positive strides, but a broadcastable CudaNdarray axis
  // has stride 0; for such axes we synthesize the C-contiguous stride
  // (product of the trailing dimensions) instead.
  //
  // Returns 0 on success, -1 (with a Python exception set) on failure.
  int strides[dim];
  for (int i = 0; i < dim; ++i)
  {
    if (CudaNdarray_HOST_STRIDES(var)[i])
      strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
    else
    {
      // Broadcastable axis: fall back to the contiguous stride.
      strides[i] = 1;
      for (int j = i + 1; j < dim; ++j)
        strides[i] *= CudaNdarray_HOST_DIMS(var)[j];
    }
  }
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                 CudaNdarray_HOST_DIMS(var),
                                                 strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    // BUG FIX: the two adjacent literals previously concatenated with no
    // separator, yielding messages like "...: CUDNN_STATUS_BAD_PARAMdim=5".
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s."
                 " dim=%d",
                 cudnnGetErrorString(err), dim);
    return -1;
  }
  return 0;
}
static int
c_set_filterNd(CudaNdarray *var, int dim, cudnnFilterDescriptor_t desc) {
  // Configure `desc` as a `dim`-dimensional CUDNN_DATA_FLOAT filter
  // descriptor for the kernel array `var`. Filter descriptors carry no
  // stride information, so the kernels must be C-contiguous.
  //
  // Returns 0 on success, -1 (with a Python exception set) on failure.
  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  const cudnnStatus_t status =
      cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                 CudaNdarray_HOST_DIMS(var));
  if (status == CUDNN_STATUS_SUCCESS)
    return 0;
  PyErr_Format(PyExc_RuntimeError,
               "Could not set filter descriptor: %s."
               " dims= %d",
               cudnnGetErrorString(status), dim);
  return -1;
}
static int static int
c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) { c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
if (!CudaNdarray_is_c_contiguous(var)) { if (!CudaNdarray_is_c_contiguous(var)) {
......
...@@ -7,9 +7,9 @@ cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns); ...@@ -7,9 +7,9 @@ cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
and the algorithms, if any, that were selected according to these dimensions and the algorithms, if any, that were selected according to these dimensions
and according to the amount of memory available at that time. and according to the amount of memory available at that time.
*/ */
int APPLY_SPECIFIC(previous_input_shape)[4]; int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[4]; int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[4]; int APPLY_SPECIFIC(previous_output_shape)[5];
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo); cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo); cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo); cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
...@@ -21,12 +21,12 @@ APPLY_SPECIFIC(input) = NULL; ...@@ -21,12 +21,12 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL; APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL; APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
...@@ -36,7 +36,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) ...@@ -36,7 +36,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns)))
FAIL; FAIL;
} }
for (int i = 0; i < 4; i++) for (int i = 0; i < 5; i++)
{ {
APPLY_SPECIFIC(previous_input_shape)[i] = 0; APPLY_SPECIFIC(previous_input_shape)[i] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[i] = 0; APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
......
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
int int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc, CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **output) { float alpha, float beta, int nb_dim, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) { if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -11,37 +12,49 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -11,37 +12,49 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
/* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
Py_XDECREF(*output); Py_XDECREF(*output);
*output = om; *output = om;
Py_INCREF(*output); Py_INCREF(*output);
#else #else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0) if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
/* if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
{ {
size_t worksize; size_t worksize;
void *workspace; void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo; cudnnConvolutionFwdAlgo_t chosen_algo;
for (int i = 0; (i < nb_dim); i++)
std::cout << i << "/" << nb_dim << ", "
<< CudaNdarray_HOST_DIMS(input)[i] << ", "
<< CudaNdarray_HOST_DIMS(kerns)[i] << std::endl;
if (CHOOSE_ALGO) if (CHOOSE_ALGO)
{ {
// Check if the input and the kernels have the same shape as they have // Check if the input and the kernels have the same shape as they have
// last time the apply node was executed // last time the apply node was executed
bool same_shapes = true; bool same_shapes = true;
for (int i = 0; (i < 4) && same_shapes; i++) for (int i = 0; (i < nb_dim) && same_shapes; i++)
{ {
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] != same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] !=
APPLY_SPECIFIC(previous_input_shape)[i]); APPLY_SPECIFIC(previous_input_shape)[i]);
...@@ -115,7 +128,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -115,7 +128,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// Store the shapes of the inputs and kernels as well as the chosen // Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use. // algorithm for future use.
APPLY_SPECIFIC(previous_algo) = chosen_algo; APPLY_SPECIFIC(previous_algo) = chosen_algo;
for (int i = 0; i < 4; i++) for (int i = 0; i < nb_dim; i++)
{ {
APPLY_SPECIFIC(previous_input_shape)[i] = APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i]; CudaNdarray_HOST_DIMS(input)[i];
...@@ -142,7 +155,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -142,7 +155,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// If the chosen implementation is FFT, validate that it can be used // If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it // on the current data and default on a safe implementation if it
// can't. // can't.
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) // Following code is 2d-specific, but it is fine as ftt is define only for 2d-filters
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
{ {
// Extract the properties of the convolution descriptor // Extract the properties of the convolution descriptor
...@@ -186,12 +200,12 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -186,12 +200,12 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo, chosen_algo,
&worksize); &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here" << std::endl;
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s", "GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
workspace = get_work_mem(worksize); workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0) if (workspace == NULL && worksize != 0)
return 1; return 1;
...@@ -208,6 +222,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -208,6 +222,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output)); APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here2" << std::endl;
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc, CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **input) { float alpha, float beta, int nb_dim, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) { if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) /* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -22,13 +27,16 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -22,13 +27,16 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
*input = im; *input = im;
Py_INCREF(*input); Py_INCREF(*input);
#else #else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0) if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) /* if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
if (c_set_tensorNd(*input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
{ {
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc, CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **kerns) { float alpha, float beta, int nb_dim, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) { if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) /* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -22,13 +27,15 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -22,13 +27,15 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
*kerns = km; *kerns = km;
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else #else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0) if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1; return 1;
#endif #endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) /* if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_filterNd(*kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
{ {
......
...@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d ...@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
...@@ -763,6 +764,58 @@ def test_dnn_conv_grad(): ...@@ -763,6 +764,58 @@ def test_dnn_conv_grad():
utt.verify_grad(dconvw, [img_val, kern_val, out_val]) utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def test_conv3d_valid():
    """Check dnn.dnn_conv3d against the theano.tensor.nnet.conv3D reference
    on several shape/subsample combinations (valid border mode,
    cross-correlation).
    """
    if not cuda.dnn.dnn_available():
        # 3d convolution needs cudnn v2+; skip on older versions.
        raise SkipTest('3D conv not supported in cudnn v1')

    def run_conv3d_valid(inputs_shape, filters_shape,
                         subsample=(1, 1, 1)):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # conv3D expects (batch, d0, d1, d2, channels); dnn_conv3d uses
        # (batch, channels, d0, d1, d2), hence the dimshuffles to and from
        # the reference layout.
        conv_ref = theano.tensor.nnet.conv3D(V=inputs.dimshuffle(0, 2, 3, 4, 1),
                                             W=filters.dimshuffle(0, 2, 3, 4, 1),
                                             b=bias, d=subsample)
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode="valid", subsample=subsample,
                              conv_mode='cross')

        f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3))
        f = theano.function([], conv, mode=mode_with_gpu)

        utt.assert_allclose(f_ref(), f())

    run_conv3d_valid(inputs_shape=(128, 3, 5, 5, 5),
                     filters_shape=(64, 3, 1, 2, 4))
    run_conv3d_valid(inputs_shape=(16, 4, 20, 12, 15),
                     filters_shape=(10, 4, 6, 12, 4),
                     subsample=(2, 2, 2))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 2, 20, 12, 15),
                     filters_shape=(10, 2, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 2, 1))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(1, 2, 3))
def test_version(): def test_version():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论