Refactor spatial transformer C implementation to use helper functions

上级 a203ad71
#section support_code
#section support_code_struct
typedef struct __spatialtf_context {
cudnnTensorDescriptor_t xdesc;
cudnnTensorDescriptor_t ydesc;
} spatialtf_context_t;
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(ydesc);
void spatialtf_context_init( spatialtf_context_t * ctx )
{
if ( ctx == NULL )
return;
#section init_code_struct
ctx->xdesc = NULL;
ctx->ydesc = NULL;
}
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(ydesc) = NULL;
void spatialtf_context_destroy( spatialtf_context_t * ctx )
{
if ( NULL != ctx->xdesc )
cudnnDestroyTensorDescriptor( ctx->xdesc );
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor xdesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
if ( NULL != ctx->ydesc )
cudnnDestroyTensorDescriptor( ctx->ydesc );
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(ydesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor ydesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
}
#section cleanup_code_struct
if ( APPLY_SPECIFIC(xdesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(ydesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(ydesc) );
#section support_code_struct
int
dnn_sptf(PyGpuArrayObject * input,
PyGpuArrayObject * theta,
PyArrayObject * grid_dims,
cudnnSpatialTransformerDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject ** output,
PyGpuArrayObject ** grid,
cudnnHandle_t _handle)
APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
PyGpuArrayObject * theta,
PyArrayObject * grid_dims,
cudnnSpatialTransformerDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject ** output,
PyGpuArrayObject ** grid,
cudnnHandle_t _handle)
{
PyGpuContextObject * gpu_ctx = input->context;
void * alpha_p;
void * beta_p;
float af = alpha, bf = beta;
spatialtf_context_t spatialtf_ctx;
cudnnDataType_t dt;
cudnnTensorFormat_t tf = CUDNN_TENSOR_NCHW;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
int num_images, num_channels, height, width;
size_t gpu_grid_dims[4], out_dims[4];
switch (input->ga.typecode)
{
case GA_DOUBLE:
alpha_p = (void *)α
beta_p = (void *)β
dt = CUDNN_DATA_DOUBLE;
break;
case GA_FLOAT:
alpha_p = (void *)⁡
beta_p = (void *)&bf;
dt = CUDNN_DATA_FLOAT;
break;
case GA_HALF:
alpha_p = (void *)⁡
beta_p = (void *)&bf;
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString( PyExc_TypeError,
"GpuDnnTransformer: unsupported type for input in spatial transformer." );
return -1;
}
if ( ! GpuArray_IS_C_CONTIGUOUS( &(input->ga) ) )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: input data is not C-contiguous." );
return -1;
return 1;
}
if ( theta->ga.typecode != GA_FLOAT &&
......@@ -80,13 +83,7 @@ dnn_sptf(PyGpuArrayObject * input,
{
PyErr_SetString( PyExc_TypeError,
"GpuDnnTransformer: unsupported data type for theta in spatial transformer." );
return -1;
}
else if ( PyGpuArray_NDIM( theta ) != 3 )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: theta must have three dimensions!" );
return -1;
return 1;
}
else if ( PyGpuArray_DIM( theta, 1 ) != 2 && PyGpuArray_DIM( theta, 2 ) != 3 )
{
......@@ -94,131 +91,78 @@ dnn_sptf(PyGpuArrayObject * input,
"GpuDnnTransformer: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
PyGpuArray_DIMS( theta )[0], 2, 3, PyGpuArray_DIMS( theta )[0],
PyGpuArray_DIMS( theta )[1], PyGpuArray_DIMS( theta )[2] );
return -1;
}
else if ( ! GpuArray_IS_C_CONTIGUOUS( &(theta->ga) ) )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: theta is not C-contiguous" );
return -1;
return 1;
}
if ( PyArray_NDIM( grid_dims ) != 1 || PyArray_SIZE( grid_dims ) != 4 )
{
PyErr_SetString( PyExc_RuntimeError,
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: grid_dims must have 4 elements." );
return -1;
return 1;
}
// Obtain grid dimensions
const int num_images = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 0 ) );
const int num_channels = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 1 ) );
const int height = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 2 ) );
const int width = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 3 ) );
const size_t gpu_grid_dims[4] = { num_images, height, width, 2 };
num_images = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 0 ) );
num_channels = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 1 ) );
height = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 2 ) );
width = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 3 ) );
if ( width == 0 || height == 0 || num_images == 0 )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: grid_dims has a dimension with value zero" );
return -1;
}
gpu_grid_dims[0] = num_images;
gpu_grid_dims[1] = height;
gpu_grid_dims[2] = width;
gpu_grid_dims[3] = 2;
spatialtf_context_init( &spatialtf_ctx );
out_dims[0] = num_images;
out_dims[1] = num_channels;
out_dims[2] = height;
out_dims[3] = width;
cuda_enter( gpu_ctx->ctx );
if ( theano_prep_output( grid, 4, gpu_grid_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
if ( width == 0 || height == 0 || num_images == 0 )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: could not allocate memory for grid of coordinates" );
return -1;
}
err = cudnnCreateTensorDescriptor( &(spatialtf_ctx.xdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: could not create xdesc: %s",
cudnnGetErrorString(err) );
return -1;
"GpuDnnTransformer: grid_dims has a dimension with value zero" );
return 1;
}
// In the input tensor, we must use its width and height, instead
// of the grid's width and height. The number of images and channels
// should be the same as the grid dimensions
const int input_num_images = (int) PyGpuArray_DIM( input, 0 );
const int input_num_channels = (int) PyGpuArray_DIM( input, 1 );
const int input_height = (int) PyGpuArray_DIM( input, 2 );
const int input_width = (int) PyGpuArray_DIM( input, 3 );
if ( input_num_images != num_images || input_num_channels != num_channels )
if ( PyGpuArray_DIM( input, 0 ) != num_images )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: expected input to have %d inputs, got %d inputs.",
num_images, input_num_images );
return -1;
num_images, PyGpuArray_DIM( input, 0 ) );
return 1;
}
err = cudnnSetTensor4dDescriptor( spatialtf_ctx.xdesc, tf, dt, num_images,
input_num_channels, input_height, input_width );
if ( err != CUDNN_STATUS_SUCCESS )
else if ( PyGpuArray_DIM( input, 1 ) != num_channels )
{
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to initialize xdesc: %s",
cudnnGetErrorString(err) );
return -1;
"GpuDnnTransformer: expected input to have %d channels, got %d channels.",
num_channels, PyGpuArray_DIM( input, 1 ) );
return 1;
}
err = cudnnCreateTensorDescriptor( &(spatialtf_ctx.ydesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to create ydesc: %s",
cudnnGetErrorString(err) );
return -1;
}
err = cudnnSetTensor4dDescriptor( spatialtf_ctx.ydesc, tf, dt, num_images,
input_num_channels, height, width );
if ( err != CUDNN_STATUS_SUCCESS )
if ( theano_prep_output( grid, 4, gpu_grid_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
{
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to initialize ydesc: %s",
cudnnGetErrorString(err) );
return -1;
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: could not allocate memory for grid of coordinates" );
return 1;
}
const size_t out_dims[4] = { num_images, input_num_channels, height, width };
if ( theano_prep_output( output, 4, out_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
{
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: could not allocate memory for grid sampler" );
return -1;
return 1;
}
if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
return 1;
if ( c_set_tensorNd( *output, APPLY_SPECIFIC(ydesc) ) != 0 )
return 1;
cuda_enter( gpu_ctx->ctx );
cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
......@@ -230,32 +174,30 @@ dnn_sptf(PyGpuArrayObject * input,
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to create grid of coordinates: %s",
"GpuDnnTransformer: could not create grid of coordinates: %s",
cudnnGetErrorString( err ) );
return -1;
cuda_exit( gpu_ctx->ctx );
return 1;
}
err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, spatialtf_ctx.xdesc,
err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, APPLY_SPECIFIC(xdesc),
PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( *grid ), beta_p,
spatialtf_ctx.ydesc, PyGpuArray_DEV_DATA( *output ) );
APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) );
cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_exit( gpu_ctx->ctx );
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to create grid sampler: %s",
"GpuDnnTransformer: could not create grid sampler: %s",
cudnnGetErrorString( err ) );
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
return -1;
return 1;
}
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
return 0;
}
......@@ -2898,7 +2898,7 @@ class GpuDnnTransformer(DnnBase):
default_output = 0
def __init__(self, dtype):
DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "dnn_sptf")
DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "APPLY_SPECIFIC(dnn_sptf)")
self.dtype = dtype
def make_node(self, img, theta, output, grid_dims, desc, alpha=None, beta=None):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论