Refactor spatial transformer C implementation to use helper functions

Parent commit: a203ad71
#section support_code #section support_code_struct
typedef struct __spatialtf_context { cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t xdesc; cudnnTensorDescriptor_t APPLY_SPECIFIC(ydesc);
cudnnTensorDescriptor_t ydesc;
} spatialtf_context_t;
void spatialtf_context_init( spatialtf_context_t * ctx ) #section init_code_struct
{
if ( ctx == NULL )
return;
ctx->xdesc = NULL; APPLY_SPECIFIC(xdesc) = NULL;
ctx->ydesc = NULL; APPLY_SPECIFIC(ydesc) = NULL;
}
void spatialtf_context_destroy( spatialtf_context_t * ctx )
{ {
if ( NULL != ctx->xdesc ) cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
cudnnDestroyTensorDescriptor( ctx->xdesc ); err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor xdesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
if ( NULL != ctx->ydesc ) err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(ydesc) );
cudnnDestroyTensorDescriptor( ctx->ydesc ); if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor ydesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
} }
#section cleanup_code_struct
if ( APPLY_SPECIFIC(xdesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(ydesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(ydesc) );
#section support_code_struct #section support_code_struct
int int
dnn_sptf(PyGpuArrayObject * input, APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
PyGpuArrayObject * theta, PyGpuArrayObject * theta,
PyArrayObject * grid_dims, PyArrayObject * grid_dims,
cudnnSpatialTransformerDescriptor_t desc, cudnnSpatialTransformerDescriptor_t desc,
double alpha, double beta, double alpha, double beta,
PyGpuArrayObject ** output, PyGpuArrayObject ** output,
PyGpuArrayObject ** grid, PyGpuArrayObject ** grid,
cudnnHandle_t _handle) cudnnHandle_t _handle)
{ {
PyGpuContextObject * gpu_ctx = input->context; PyGpuContextObject * gpu_ctx = input->context;
void * alpha_p; void * alpha_p;
void * beta_p; void * beta_p;
float af = alpha, bf = beta; float af = alpha, bf = beta;
spatialtf_context_t spatialtf_ctx;
cudnnDataType_t dt;
cudnnTensorFormat_t tf = CUDNN_TENSOR_NCHW;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
int num_images, num_channels, height, width;
size_t gpu_grid_dims[4], out_dims[4];
switch (input->ga.typecode) switch (input->ga.typecode)
{ {
case GA_DOUBLE: case GA_DOUBLE:
alpha_p = (void *)α alpha_p = (void *)α
beta_p = (void *)β beta_p = (void *)β
dt = CUDNN_DATA_DOUBLE;
break; break;
case GA_FLOAT: case GA_FLOAT:
alpha_p = (void *)⁡ alpha_p = (void *)⁡
beta_p = (void *)&bf; beta_p = (void *)&bf;
dt = CUDNN_DATA_FLOAT;
break; break;
case GA_HALF: case GA_HALF:
alpha_p = (void *)⁡ alpha_p = (void *)⁡
beta_p = (void *)&bf; beta_p = (void *)&bf;
dt = CUDNN_DATA_HALF;
break; break;
default: default:
PyErr_SetString( PyExc_TypeError, PyErr_SetString( PyExc_TypeError,
"GpuDnnTransformer: unsupported type for input in spatial transformer." ); "GpuDnnTransformer: unsupported type for input in spatial transformer." );
return -1; return 1;
}
if ( ! GpuArray_IS_C_CONTIGUOUS( &(input->ga) ) )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: input data is not C-contiguous." );
return -1;
} }
if ( theta->ga.typecode != GA_FLOAT && if ( theta->ga.typecode != GA_FLOAT &&
...@@ -80,13 +83,7 @@ dnn_sptf(PyGpuArrayObject * input, ...@@ -80,13 +83,7 @@ dnn_sptf(PyGpuArrayObject * input,
{ {
PyErr_SetString( PyExc_TypeError, PyErr_SetString( PyExc_TypeError,
"GpuDnnTransformer: unsupported data type for theta in spatial transformer." ); "GpuDnnTransformer: unsupported data type for theta in spatial transformer." );
return -1; return 1;
}
else if ( PyGpuArray_NDIM( theta ) != 3 )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: theta must have three dimensions!" );
return -1;
} }
else if ( PyGpuArray_DIM( theta, 1 ) != 2 && PyGpuArray_DIM( theta, 2 ) != 3 ) else if ( PyGpuArray_DIM( theta, 1 ) != 2 && PyGpuArray_DIM( theta, 2 ) != 3 )
{ {
...@@ -94,131 +91,78 @@ dnn_sptf(PyGpuArrayObject * input, ...@@ -94,131 +91,78 @@ dnn_sptf(PyGpuArrayObject * input,
"GpuDnnTransformer: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)", "GpuDnnTransformer: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
PyGpuArray_DIMS( theta )[0], 2, 3, PyGpuArray_DIMS( theta )[0], PyGpuArray_DIMS( theta )[0], 2, 3, PyGpuArray_DIMS( theta )[0],
PyGpuArray_DIMS( theta )[1], PyGpuArray_DIMS( theta )[2] ); PyGpuArray_DIMS( theta )[1], PyGpuArray_DIMS( theta )[2] );
return -1; return 1;
}
else if ( ! GpuArray_IS_C_CONTIGUOUS( &(theta->ga) ) )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: theta is not C-contiguous" );
return -1;
} }
if ( PyArray_NDIM( grid_dims ) != 1 || PyArray_SIZE( grid_dims ) != 4 ) if ( PyArray_NDIM( grid_dims ) != 1 || PyArray_SIZE( grid_dims ) != 4 )
{ {
PyErr_SetString( PyExc_RuntimeError, PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: grid_dims must have 4 elements." ); "GpuDnnTransformer: grid_dims must have 4 elements." );
return -1; return 1;
} }
// Obtain grid dimensions // Obtain grid dimensions
const int num_images = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 0 ) ); num_images = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 0 ) );
const int num_channels = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 1 ) ); num_channels = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 1 ) );
const int height = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 2 ) ); height = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 2 ) );
const int width = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 3 ) ); width = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 3 ) );
const size_t gpu_grid_dims[4] = { num_images, height, width, 2 };
if ( width == 0 || height == 0 || num_images == 0 ) gpu_grid_dims[0] = num_images;
{ gpu_grid_dims[1] = height;
PyErr_SetString( PyExc_RuntimeError, gpu_grid_dims[2] = width;
"GpuDnnTransformer: grid_dims has a dimension with value zero" ); gpu_grid_dims[3] = 2;
return -1;
}
spatialtf_context_init( &spatialtf_ctx ); out_dims[0] = num_images;
out_dims[1] = num_channels;
out_dims[2] = height;
out_dims[3] = width;
cuda_enter( gpu_ctx->ctx ); if ( width == 0 || height == 0 || num_images == 0 )
if ( theano_prep_output( grid, 4, gpu_grid_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
{ {
PyErr_SetString( PyExc_RuntimeError, PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: could not allocate memory for grid of coordinates" ); "GpuDnnTransformer: grid_dims has a dimension with value zero" );
return -1; return 1;
}
err = cudnnCreateTensorDescriptor( &(spatialtf_ctx.xdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: could not create xdesc: %s",
cudnnGetErrorString(err) );
return -1;
} }
// In the input tensor, we must use its width and height, instead if ( PyGpuArray_DIM( input, 0 ) != num_images )
// of the grid's width and height. The number of images and channels
// should be the same as the grid dimensions
const int input_num_images = (int) PyGpuArray_DIM( input, 0 );
const int input_num_channels = (int) PyGpuArray_DIM( input, 1 );
const int input_height = (int) PyGpuArray_DIM( input, 2 );
const int input_width = (int) PyGpuArray_DIM( input, 3 );
if ( input_num_images != num_images || input_num_channels != num_channels )
{ {
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: expected input to have %d inputs, got %d inputs.", "GpuDnnTransformer: expected input to have %d inputs, got %d inputs.",
num_images, input_num_images ); num_images, PyGpuArray_DIM( input, 0 ) );
return -1; return 1;
} }
else if ( PyGpuArray_DIM( input, 1 ) != num_channels )
err = cudnnSetTensor4dDescriptor( spatialtf_ctx.xdesc, tf, dt, num_images,
input_num_channels, input_height, input_width );
if ( err != CUDNN_STATUS_SUCCESS )
{ {
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to initialize xdesc: %s", "GpuDnnTransformer: expected input to have %d channels, got %d channels.",
cudnnGetErrorString(err) ); num_channels, PyGpuArray_DIM( input, 1 ) );
return -1; return 1;
} }
err = cudnnCreateTensorDescriptor( &(spatialtf_ctx.ydesc) ); if ( theano_prep_output( grid, 4, gpu_grid_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
if ( err != CUDNN_STATUS_SUCCESS )
{
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to create ydesc: %s",
cudnnGetErrorString(err) );
return -1;
}
err = cudnnSetTensor4dDescriptor( spatialtf_ctx.ydesc, tf, dt, num_images,
input_num_channels, height, width );
if ( err != CUDNN_STATUS_SUCCESS )
{ {
spatialtf_context_destroy( &spatialtf_ctx ); PyErr_SetString( PyExc_RuntimeError,
cuda_exit( gpu_ctx->ctx ); "GpuDnnTransformer: could not allocate memory for grid of coordinates" );
return 1;
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to initialize ydesc: %s",
cudnnGetErrorString(err) );
return -1;
} }
const size_t out_dims[4] = { num_images, input_num_channels, height, width };
if ( theano_prep_output( output, 4, out_dims, input->ga.typecode, if ( theano_prep_output( output, 4, out_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 ) GA_C_ORDER, gpu_ctx ) != 0 )
{ {
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
PyErr_SetString( PyExc_MemoryError, PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: could not allocate memory for grid sampler" ); "GpuDnnTransformer: could not allocate memory for grid sampler" );
return -1; return 1;
} }
if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
return 1;
if ( c_set_tensorNd( *output, APPLY_SPECIFIC(ydesc) ) != 0 )
return 1;
cuda_enter( gpu_ctx->ctx );
cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
...@@ -230,32 +174,30 @@ dnn_sptf(PyGpuArrayObject * input, ...@@ -230,32 +174,30 @@ dnn_sptf(PyGpuArrayObject * input,
if ( CUDNN_STATUS_SUCCESS != err ) if ( CUDNN_STATUS_SUCCESS != err )
{ {
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to create grid of coordinates: %s", "GpuDnnTransformer: could not create grid of coordinates: %s",
cudnnGetErrorString( err ) ); cudnnGetErrorString( err ) );
return -1; cuda_exit( gpu_ctx->ctx );
return 1;
} }
err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, spatialtf_ctx.xdesc, err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, APPLY_SPECIFIC(xdesc),
PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( *grid ), beta_p, PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( *grid ), beta_p,
spatialtf_ctx.ydesc, PyGpuArray_DEV_DATA( *output ) ); APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) );
cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_exit( gpu_ctx->ctx );
if ( CUDNN_STATUS_SUCCESS != err ) if ( CUDNN_STATUS_SUCCESS != err )
{ {
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: failed to create grid sampler: %s", "GpuDnnTransformer: could not create grid sampler: %s",
cudnnGetErrorString( err ) ); cudnnGetErrorString( err ) );
spatialtf_context_destroy( &spatialtf_ctx ); return 1;
cuda_exit( gpu_ctx->ctx );
return -1;
} }
spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx );
return 0; return 0;
} }
...@@ -2898,7 +2898,7 @@ class GpuDnnTransformer(DnnBase): ...@@ -2898,7 +2898,7 @@ class GpuDnnTransformer(DnnBase):
default_output = 0 default_output = 0
def __init__(self, dtype): def __init__(self, dtype):
DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "dnn_sptf") DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "APPLY_SPECIFIC(dnn_sptf)")
self.dtype = dtype self.dtype = dtype
def make_node(self, img, theta, output, grid_dims, desc, alpha=None, beta=None): def make_node(self, img, theta, output, grid_dims, desc, alpha=None, beta=None):
......
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment