提交 6117f98b authored 作者: Nicolas Ballas's avatar Nicolas Ballas 提交者: --global

add cudnnv3 conv3d

上级 34223240
差异被折叠。
......@@ -33,6 +33,60 @@ c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
return 0;
}
/* Fill `desc` so cuDNN addresses `var` as a `dim`-dimensional float32 tensor.
 *
 * A CudaNdarray stores a stride of 0 for broadcastable dimensions, which
 * cuDNN rejects; for those dimensions we substitute the stride the dimension
 * would have in a C-contiguous layout (product of the trailing dimension
 * sizes).  NOTE(review): assumes strides of 0 only occur on size-1
 * (broadcastable) dims — confirm against CudaNdarray's invariants.
 *
 * Returns 0 on success, -1 with a Python RuntimeError set on failure. */
static int
c_set_tensorNd(CudaNdarray *var, int dim, cudnnTensorDescriptor_t desc) {
  int strides[dim];
  for (int i = 0; i < dim; ++i)
  {
    if (CudaNdarray_HOST_STRIDES(var)[i])
      strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
    else
    {
      // Stride 0 (broadcastable dim): use the C-contiguous stride instead.
      strides[i] = 1;
      for (int j = i + 1; j < dim; ++j)
        strides[i] *= CudaNdarray_HOST_DIMS(var)[j];
    }
  }
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                 CudaNdarray_HOST_DIMS(var),
                                                 strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    // Bug fix: the two adjacent string literals used to concatenate with no
    // separator, yielding "...: <err>dim=3"; add punctuation and a space.
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s. "
                 "dim=%d",
                 cudnnGetErrorString(err), dim);
    return -1;
  }
  return 0;
}
/* Describe the filter (kernel) array `var` to cuDNN as a `dim`-dimensional
 * float32 filter.  cuDNN filter descriptors carry no strides, so the data
 * must be C-contiguous; non-contiguous filters are rejected up front.
 * Returns 0 on success, -1 with a Python exception set on failure. */
static int
c_set_filterNd(CudaNdarray *var, int dim, cudnnFilterDescriptor_t desc) {
  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  const int *filter_dims = CudaNdarray_HOST_DIMS(var);
  cudnnStatus_t status =
      cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim, filter_dims);
  if (status == CUDNN_STATUS_SUCCESS)
    return 0;
  PyErr_Format(PyExc_RuntimeError,
               "Could not set filter descriptor: %s."
               " dims= %d",
               cudnnGetErrorString(status), dim);
  return -1;
}
static int
c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
if (!CudaNdarray_is_c_contiguous(var)) {
......
......@@ -7,9 +7,9 @@ cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
and the algorithms, if any, that were selected according to these dimensions
and according to the amount of memory available at that time.
*/
int APPLY_SPECIFIC(previous_input_shape)[4];
int APPLY_SPECIFIC(previous_kerns_shape)[4];
int APPLY_SPECIFIC(previous_output_shape)[4];
int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
......@@ -21,12 +21,12 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
......@@ -36,7 +36,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns)))
FAIL;
}
for (int i = 0; i < 4; i++)
for (int i = 0; i < 5; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
......
......@@ -3,7 +3,8 @@
int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **output) {
float alpha, float beta, int nb_dim, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
......@@ -11,37 +12,49 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1;
/* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0)
if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1;
#endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_tensorNd(*output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1;
/* if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
{
size_t worksize;
void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo;
for (int i = 0; (i < nb_dim); i++)
std::cout << i << "/" << nb_dim << ", "
<< CudaNdarray_HOST_DIMS(input)[i] << ", "
<< CudaNdarray_HOST_DIMS(kerns)[i] << std::endl;
if (CHOOSE_ALGO)
{
// Check if the input and the kernels have the same shape as they have
// last time the apply node was executed
bool same_shapes = true;
for (int i = 0; (i < 4) && same_shapes; i++)
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] !=
APPLY_SPECIFIC(previous_input_shape)[i]);
......@@ -115,7 +128,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_algo) = chosen_algo;
for (int i = 0; i < 4; i++)
for (int i = 0; i < nb_dim; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i];
......@@ -142,7 +155,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// can't.
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
// Following code is 2d-specific, but that is fine as FFT is defined only for 2d filters
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
......@@ -186,12 +200,12 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here" << std::endl;
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err));
cudnnGetErrorString(err));
return 1;
}
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
......@@ -208,6 +222,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
}
if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here2" << std::endl;
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **input) {
float alpha, float beta, int nb_dim, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
......@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1;
}
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
/* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1;
#ifdef CONV_INPLACE
......@@ -22,13 +27,16 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
*input = im;
Py_INCREF(*input);
#else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
/* if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
if (c_set_tensorNd(*input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1;
{
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **kerns) {
float alpha, float beta, int nb_dim, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
......@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
/* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1;
#ifdef CONV_INPLACE
......@@ -22,13 +27,15 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
*kerns = km;
Py_INCREF(*kerns);
#else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0)
if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
/* if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_filterNd(*kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1;
{
......
......@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
......@@ -763,6 +764,58 @@ def test_dnn_conv_grad():
utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def test_conv3d_valid():
    # Bug fix: the availability check must run first — the old code called
    # `print dnn.version()` before it, which raises when cuDNN is absent
    # instead of skipping.  Also removes leftover debug prints, a stray `"`
    # inside the skip message, and an exactly duplicated subsample=(2, 2, 2)
    # invocation.
    if not cuda.dnn.dnn_available():
        raise SkipTest('3D conv not supported in cudnn v1')

    def run_conv3d_valid(inputs_shape, filters_shape,
                         subsample=(1, 1, 1)):
        # Compare dnn_conv3d against the reference conv3D on random float32
        # data of the given (b, c, d0, d1, d2) shapes.
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # conv3D expects a channels-last layout, so shuffle into it for the
        # reference and back to channels-first for the comparison.
        conv_ref = theano.tensor.nnet.conv3D(V=inputs.dimshuffle(0, 2, 3, 4, 1),
                                             W=filters.dimshuffle(0, 2, 3, 4, 1),
                                             b=bias, d=subsample)
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode="valid", subsample=subsample,
                              conv_mode='cross')

        f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3))
        f = theano.function([], conv, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    run_conv3d_valid(inputs_shape=(128, 3, 5, 5, 5),
                     filters_shape=(64, 3, 1, 2, 4))
    run_conv3d_valid(inputs_shape=(16, 4, 20, 12, 15),
                     filters_shape=(10, 4, 6, 12, 4),
                     subsample=(2, 2, 2))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 2, 20, 12, 15),
                     filters_shape=(10, 2, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 2, 1))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(1, 2, 3))
def test_version():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论