Add grid of coordinates as output of GpuDnnTransformer

上级 c99cc112
#section support_code #section support_code
typedef struct __spatialtf_context { typedef struct __spatialtf_context {
PyGpuArrayObject * grid;
cudnnTensorDescriptor_t xdesc; cudnnTensorDescriptor_t xdesc;
cudnnTensorDescriptor_t ydesc; cudnnTensorDescriptor_t ydesc;
} spatialtf_context_t; } spatialtf_context_t;
...@@ -11,15 +10,12 @@ void spatialtf_context_init( spatialtf_context_t * ctx ) ...@@ -11,15 +10,12 @@ void spatialtf_context_init( spatialtf_context_t * ctx )
if ( ctx == NULL ) if ( ctx == NULL )
return; return;
ctx->grid = NULL;
ctx->xdesc = NULL; ctx->xdesc = NULL;
ctx->ydesc = NULL; ctx->ydesc = NULL;
} }
void spatialtf_context_destroy( spatialtf_context_t * ctx ) void spatialtf_context_destroy( spatialtf_context_t * ctx )
{ {
Py_XDECREF( ctx->grid );
if ( NULL != ctx->xdesc ) if ( NULL != ctx->xdesc )
cudnnDestroyTensorDescriptor( ctx->xdesc ); cudnnDestroyTensorDescriptor( ctx->xdesc );
...@@ -36,6 +32,7 @@ dnn_sptf(PyGpuArrayObject * input, ...@@ -36,6 +32,7 @@ dnn_sptf(PyGpuArrayObject * input,
cudnnSpatialTransformerDescriptor_t desc, cudnnSpatialTransformerDescriptor_t desc,
double alpha, double beta, double alpha, double beta,
PyGpuArrayObject ** output, PyGpuArrayObject ** output,
PyGpuArrayObject ** grid,
cudnnHandle_t _handle) cudnnHandle_t _handle)
{ {
PyGpuContextObject * gpu_ctx = input->context; PyGpuContextObject * gpu_ctx = input->context;
...@@ -130,10 +127,8 @@ dnn_sptf(PyGpuArrayObject * input, ...@@ -130,10 +127,8 @@ dnn_sptf(PyGpuArrayObject * input,
cuda_enter( gpu_ctx->ctx ); cuda_enter( gpu_ctx->ctx );
spatialtf_ctx.grid = pygpu_empty(4, &(gpu_grid_dims[0]), input->ga.typecode, GA_C_ORDER, if ( theano_prep_output( grid, 4, gpu_grid_dims, input->ga.typecode,
gpu_ctx, Py_None); GA_C_ORDER, gpu_ctx ) != 0 )
if ( spatialtf_ctx.grid == NULL )
{ {
PyErr_SetString( PyExc_RuntimeError, PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: could not allocate memory for grid of coordinates" ); "GpuDnnTransformer: could not allocate memory for grid of coordinates" );
...@@ -225,10 +220,11 @@ dnn_sptf(PyGpuArrayObject * input, ...@@ -225,10 +220,11 @@ dnn_sptf(PyGpuArrayObject * input,
cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
err = cudnnSpatialTfGridGeneratorForward( _handle, desc, PyGpuArray_DEV_DATA( theta ), err = cudnnSpatialTfGridGeneratorForward( _handle, desc, PyGpuArray_DEV_DATA( theta ),
PyGpuArray_DEV_DATA( spatialtf_ctx.grid ) ); PyGpuArray_DEV_DATA( *grid ) );
if ( CUDNN_STATUS_SUCCESS != err ) if ( CUDNN_STATUS_SUCCESS != err )
{ {
...@@ -239,11 +235,12 @@ dnn_sptf(PyGpuArrayObject * input, ...@@ -239,11 +235,12 @@ dnn_sptf(PyGpuArrayObject * input,
} }
err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, spatialtf_ctx.xdesc, err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, spatialtf_ctx.xdesc,
PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( spatialtf_ctx.grid ), PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( *grid ), beta_p,
beta_p, spatialtf_ctx.ydesc, PyGpuArray_DEV_DATA( *output ) ); spatialtf_ctx.ydesc, PyGpuArray_DEV_DATA( *output ) );
cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
if ( CUDNN_STATUS_SUCCESS != err ) if ( CUDNN_STATUS_SUCCESS != err )
......
...@@ -2893,8 +2893,9 @@ class GpuDnnTransformer(DnnBase): ...@@ -2893,8 +2893,9 @@ class GpuDnnTransformer(DnnBase):
__props__ = ('dtype',) __props__ = ('dtype',)
_cop_num_inputs = 6 _cop_num_inputs = 6
_cop_num_outputs = 1 _cop_num_outputs = 2
_f16_ok = True _f16_ok = True
default_output = 0
def __init__(self, dtype): def __init__(self, dtype):
DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "dnn_sptf") DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "dnn_sptf")
...@@ -2912,6 +2913,9 @@ class GpuDnnTransformer(DnnBase): ...@@ -2912,6 +2913,9 @@ class GpuDnnTransformer(DnnBase):
output = GpuArrayType(dtype=self.dtype, output = GpuArrayType(dtype=self.dtype,
broadcastable=img.type.ndim * (False,), broadcastable=img.type.ndim * (False,),
context_name=context_name)() context_name=context_name)()
grid = GpuArrayType(dtype=self.dtype,
broadcastable=img.type.ndim * (False,),
context_name=context_name)()
if img.type.ndim != 4: if img.type.ndim != 4:
raise TypeError('img must be a 4D tensor') raise TypeError('img must be a 4D tensor')
...@@ -2923,7 +2927,9 @@ class GpuDnnTransformer(DnnBase): ...@@ -2923,7 +2927,9 @@ class GpuDnnTransformer(DnnBase):
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
beta = ensure_dt(beta, _zero, 'beta', img.dtype) beta = ensure_dt(beta, _zero, 'beta', img.dtype)
return Apply(self, [img, theta, grid_dims, desc, alpha, beta], [output]) inputs = [img, theta, grid_dims, desc, alpha, beta]
outputs = [output, grid]
return Apply(self, inputs, outputs)
def L_op(self, inputs, outputs, grads): def L_op(self, inputs, outputs, grads):
pass pass
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论