Add grid of coordinates as second output of GpuDnnTransformer

Signed-off-by: João Victor Tozatti Risso <joaovictor.risso@gmail.com>

Add grid of coordinates as output of GpuDnnTransformer
975a9b7b · João Victor Tozatti Risso · c99cc112 · 975a9b7b · 975a9b7b
--- a/theano/gpuarray/c_code/dnn_sptf.c
+++ b/theano/gpuarray/c_code/dnn_sptf.c
 #section support_code

 typedef struct __spatialtf_context {
-    PyGpuArrayObject * grid;
    cudnnTensorDescriptor_t xdesc;
    cudnnTensorDescriptor_t ydesc;
 } spatialtf_context_t;
@@ -11,15 +10,12 @@ void spatialtf_context_init( spatialtf_context_t * ctx )
    if ( ctx == NULL )
        return;

-    ctx->grid = NULL;
    ctx->xdesc = NULL;
    ctx->ydesc = NULL;
 }

 void spatialtf_context_destroy( spatialtf_context_t * ctx )
 {
-    Py_XDECREF( ctx->grid );
-
    if ( NULL != ctx->xdesc )
        cudnnDestroyTensorDescriptor( ctx->xdesc );

@@ -36,6 +32,7 @@ dnn_sptf(PyGpuArrayObject * input,
         cudnnSpatialTransformerDescriptor_t desc,
         double alpha, double beta,
         PyGpuArrayObject ** output,
+         PyGpuArrayObject ** grid,
         cudnnHandle_t _handle)
 {
    PyGpuContextObject * gpu_ctx = input->context;
@@ -130,10 +127,8 @@ dnn_sptf(PyGpuArrayObject * input,

    cuda_enter( gpu_ctx->ctx );

-    spatialtf_ctx.grid = pygpu_empty(4, &(gpu_grid_dims[0]), input->ga.typecode, GA_C_ORDER,
-        gpu_ctx, Py_None);
-
-    if ( spatialtf_ctx.grid == NULL )
+    if ( theano_prep_output( grid, 4, gpu_grid_dims, input->ga.typecode,
+                             GA_C_ORDER, gpu_ctx ) != 0 )
    {
        PyErr_SetString( PyExc_RuntimeError,
                         "GpuDnnTransformer: could not allocate memory for grid of coordinates" );
@@ -225,10 +220,11 @@ dnn_sptf(PyGpuArrayObject * input,

    cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
+    cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfGridGeneratorForward( _handle, desc, PyGpuArray_DEV_DATA( theta ),
-        PyGpuArray_DEV_DATA( spatialtf_ctx.grid ) );
+        PyGpuArray_DEV_DATA( *grid ) );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
@@ -239,11 +235,12 @@ dnn_sptf(PyGpuArrayObject * input,
    }

    err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, spatialtf_ctx.xdesc,
-        PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( spatialtf_ctx.grid ),
-        beta_p, spatialtf_ctx.ydesc, PyGpuArray_DEV_DATA( *output ) );
+        PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( *grid ), beta_p,
+        spatialtf_ctx.ydesc, PyGpuArray_DEV_DATA( *output ) );

    cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
+    cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    if ( CUDNN_STATUS_SUCCESS != err )

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -2893,8 +2893,9 @@ class GpuDnnTransformer(DnnBase):

    __props__ = ('dtype',)
    _cop_num_inputs = 6
-    _cop_num_outputs = 1
+    _cop_num_outputs = 2
    _f16_ok = True
+    default_output = 0

    def __init__(self, dtype):
        DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "dnn_sptf")
@@ -2912,6 +2913,9 @@ class GpuDnnTransformer(DnnBase):
        output = GpuArrayType(dtype=self.dtype,
                              broadcastable=img.type.ndim * (False,),
                              context_name=context_name)()
+        grid = GpuArrayType(dtype=self.dtype,
+                            broadcastable=img.type.ndim * (False,),
+                            context_name=context_name)()

        if img.type.ndim != 4:
            raise TypeError('img must be a 4D tensor')
@@ -2923,7 +2927,9 @@ class GpuDnnTransformer(DnnBase):
        alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
        beta = ensure_dt(beta, _zero, 'beta', img.dtype)

-        return Apply(self, [img, theta, grid_dims, desc, alpha, beta], [output])
+        inputs = [img, theta, grid_dims, desc, alpha, beta]
+        outputs = [output, grid]
+        return Apply(self, inputs, outputs)

    def L_op(self, inputs, outputs, grads):
        pass