提交 6d71e317 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6061 from joaovictortr/spatialtf_networks

Spatial Transformer Network Ops using cuDNN
......@@ -146,6 +146,8 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
- Softmax:
- You can manually use the op :class:`GpuDnnSoftmax
<theano.gpuarray.dnn.GpuDnnSoftmax>` to use its extra feature.
- Spatial Transformer:
- :func:`theano.gpuarray.dnn.dnn_spatialtf`.
List of Implemented Operations
==============================
......
#section support_code_struct

/* Per-instance cuDNN descriptors for GpuDnnTransformerGradI (gradient wrt
   the inputs of the spatial transformer sampler). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dxdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dydesc);

#section init_code_struct

/* Initialize every handle to NULL first so the cleanup section can safely
   destroy only the descriptors that were actually created. */
APPLY_SPECIFIC(sptf) = NULL;
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(dxdesc) = NULL;
APPLY_SPECIFIC(dydesc) = NULL;

{
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    /* Spatial transformer descriptor (configured later, per apply). */
    err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf));
    if (err != CUDNN_STATUS_SUCCESS)
    {
        PyErr_Format(PyExc_MemoryError,
            "GpuDnnTransformerGradI: could not allocate spatial transformer descriptor (sptf): %s",
            cudnnGetErrorString(err));
        FAIL;
    }

    /* Tensor descriptor for the forward input x. */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor xdesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }

    /* Tensor descriptor for the gradient wrt the input (dx). */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(dxdesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor dxdesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }

    /* Tensor descriptor for the gradient wrt the output (dy). */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(dydesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor dydesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }
}

#section cleanup_code_struct

/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
    cudnnDestroySpatialTransformerDescriptor( APPLY_SPECIFIC(sptf) );
if ( APPLY_SPECIFIC(xdesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(dxdesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(dxdesc) );
if ( APPLY_SPECIFIC(dydesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(dydesc) );
#section support_code_struct

/*
 * GpuDnnTransformerGradI: backward pass of the cuDNN spatial transformer
 * sampler. Computes the gradient wrt the sampled images (*input_grad) and
 * wrt the sampling grid (*grid_grad) given the gradient of the sampler
 * output (dy).
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_gi)(PyGpuArrayObject * input,
                            PyGpuArrayObject * grid,
                            PyGpuArrayObject * dy,
                            PyGpuArrayObject ** input_grad,
                            PyGpuArrayObject ** grid_grad,
                            cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = input->context;
    void * alpha_p;
    void * beta_p;
    double alpha = 1.0, beta = 0.0;
    float af = alpha, bf = beta;
    int out_dims[4];
    cudnnDataType_t dt;
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    // Select the cuDNN data type and matching scaling factors: cuDNN expects
    // double alpha/beta for double data, and float alpha/beta for float and
    // half data.
    switch (input->ga.typecode)
    {
    case GA_DOUBLE:
        alpha_p = (void *)&alpha;
        beta_p = (void *)&beta;
        dt = CUDNN_DATA_DOUBLE;
        break;
    case GA_FLOAT:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        dt = CUDNN_DATA_FLOAT;
        break;
    case GA_HALF:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        dt = CUDNN_DATA_HALF;
        break;
    default:
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformerGradI: unsupported type for input in spatial transformer gradients" );
        return 1;
    }

    if ( grid->ga.typecode != GA_FLOAT &&
         grid->ga.typecode != GA_DOUBLE &&
         grid->ga.typecode != GA_HALF )
    {
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformerGradI: unsupported data type for grid in spatial transformer gradients." );
        return 1;
    }

    // Allocate (or reuse) the output buffers, shaped like the corresponding
    // forward inputs.
    if ( theano_prep_output( input_grad, PyGpuArray_NDIM( input ),
                             PyGpuArray_DIMS( input ), input->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
        return 1;

    if ( theano_prep_output( grid_grad, PyGpuArray_NDIM( grid ),
                             PyGpuArray_DIMS( grid ), grid->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
        return 1;

    // Directly return the outputs if any of the dimensions is 0.
    // (cuDNN does not support zero-length dimensions.)
    // NOTE: this check must come *before* the descriptor setup below;
    // configuring a cuDNN descriptor with a zero-length dimension fails, so
    // placing the check afterwards would make it unreachable.
    if ( PyGpuArray_SIZE( *input_grad ) == 0 || PyGpuArray_SIZE( *grid_grad ) == 0 )
        return 0;

    // Obtain output dimensions to setup descriptor
    out_dims[0] = (int) PyGpuArray_DIM(input, 0); // num_images
    out_dims[1] = (int) PyGpuArray_DIM(input, 1); // num_channels
    out_dims[2] = (int) PyGpuArray_DIM(grid, 1); // grid height
    out_dims[3] = (int) PyGpuArray_DIM(grid, 2); // grid width

    // Currently, only the bilinear sampler is supported by cuDNN,
    // so the sampler method is currently not available as a parameter
    err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                                 dt, 4, out_dims );
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradI: could not initialize descriptor (sptf): %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
        return 1;

    if ( c_set_tensorNd( dy, APPLY_SPECIFIC(dydesc) ) != 0 )
        return 1;

    if ( c_set_tensorNd( *input_grad, APPLY_SPECIFIC(dxdesc) ) != 0 )
        return 1;

    // Make the cuDNN call wait on all pending reads/writes of its operands
    // and record our own accesses afterwards (libgpuarray stream ordering).
    cuda_enter( gpu_ctx->ctx );

    cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( dy->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*input_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_wait( (*grid_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfSamplerBackward( _handle, APPLY_SPECIFIC(sptf), alpha_p,
        APPLY_SPECIFIC(xdesc), PyGpuArray_DEV_DATA( input ), beta_p,
        APPLY_SPECIFIC(dxdesc), PyGpuArray_DEV_DATA( *input_grad ), alpha_p,
        APPLY_SPECIFIC(dydesc), PyGpuArray_DEV_DATA( dy ), PyGpuArray_DEV_DATA( grid ),
        beta_p, PyGpuArray_DEV_DATA( *grid_grad ) );

    cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( dy->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*input_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_record( (*grid_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    cuda_exit( gpu_ctx->ctx );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerGradI: failed to compute gradients of the inputs: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    return 0;
}
#section support_code_struct

/* Per-instance spatial transformer descriptor for GpuDnnTransformerGrid
   (grid generator). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);

#section init_code_struct

/* NULL-initialize so cleanup can tell whether creation succeeded. */
APPLY_SPECIFIC(sptf) = NULL;

{
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
    if ((err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf))) != CUDNN_STATUS_SUCCESS)
    {
        PyErr_Format(PyExc_MemoryError,
            "GpuDnnTransformerGrid: could not allocate spatial transformer descriptor (sptf): %s",
            cudnnGetErrorString(err));
        FAIL;
    }
}

#section cleanup_code_struct

/* Destroy the descriptor only if it was created. */
if (APPLY_SPECIFIC(sptf) != NULL) { cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf)); }
#section support_code_struct

/*
 * GpuDnnTransformerGrid: generates the sampling grid of (x, y) coordinates
 * from the per-image 2x3 affine transformations in theta, using
 * cudnnSpatialTfGridGeneratorForward.
 *
 * out_dims must be a 1-D host array of 4 int64 values (N, C, H, W) giving
 * the shape of the transformed output.
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_grid)(PyGpuArrayObject * theta,
                              PyArrayObject * out_dims,
                              PyGpuArrayObject ** grid,
                              cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = theta->context;
    size_t grid_dims[4];
    int num_images, num_channels, height, width;
    int desc_dims[4];
    cudnnDataType_t dt;
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    // Map the gpuarray typecode onto the cuDNN data type.
    switch(theta->ga.typecode)
    {
    case GA_DOUBLE:
        dt = CUDNN_DATA_DOUBLE;
        break;
    case GA_FLOAT:
        dt = CUDNN_DATA_FLOAT;
        break;
    case GA_HALF:
        dt = CUDNN_DATA_HALF;
        break;
    default:
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformerGrid: unsupported data type for theta in spatial transformer." );
        return 1;
    }

    // cuDNN only supports 2x3 affine transformation matrices.
    // Note: the dimensions are cast to int because PyErr_Format's %d
    // expects an int argument; passing size_t to %d is undefined behavior.
    if ( PyGpuArray_DIM( theta, 1 ) != 2 || PyGpuArray_DIM( theta, 2 ) != 3 )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerGrid: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
            (int) PyGpuArray_DIM( theta, 0 ), 2, 3, (int) PyGpuArray_DIM( theta, 0 ),
            (int) PyGpuArray_DIM( theta, 1 ), (int) PyGpuArray_DIM( theta, 2 ) );
        return 1;
    }

    // Shape validation failure: raise ValueError (not MemoryError, which is
    // reserved for allocation failures).
    if ( PyArray_NDIM( out_dims ) != 1 || PyArray_SIZE( out_dims ) != 4 )
    {
        PyErr_SetString( PyExc_ValueError,
            "GpuDnnTransformerGrid: out_dims must have 4 elements." );
        return 1;
    }

    // Obtain output dimensions (the Python Op casts out_dims to int64
    // before calling into this code)
    num_images = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 0 ) );
    num_channels = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 1 ) );
    height = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 2 ) );
    width = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 3 ) );

    // Set transformed output dimensions to setup the descriptor
    desc_dims[0] = num_images;
    desc_dims[1] = num_channels;
    desc_dims[2] = height;
    desc_dims[3] = width;

    // Set sampling grid dimensions: one (x, y) pair per output pixel.
    grid_dims[0] = num_images;
    grid_dims[1] = height;
    grid_dims[2] = width;
    grid_dims[3] = 2;

    // Currently, only the bilinear sampler is supported by cuDNN,
    // so the sampler method is currently not available as a parameter
    err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                                 dt, 4, desc_dims );
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGrid: could not initialize descriptor (sptf): %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    if ( theano_prep_output( grid, 4, grid_dims, theta->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
    {
        PyErr_SetString( PyExc_RuntimeError,
            "GpuDnnTransformerGrid: could not allocate memory for grid of coordinates" );
        return 1;
    }

    // Order the cuDNN call against pending accesses of theta and grid.
    cuda_enter( gpu_ctx->ctx );

    cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfGridGeneratorForward( _handle, APPLY_SPECIFIC(sptf),
        PyGpuArray_DEV_DATA( theta ), PyGpuArray_DEV_DATA( *grid ) );

    cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    cuda_exit( gpu_ctx->ctx );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerGrid: could not create grid of coordinates: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    return 0;
}
#section support_code_struct

/* Per-instance spatial transformer descriptor for GpuDnnTransformerGradT
   (gradient wrt the affine transformation theta). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);

#section init_code_struct

/* NULL-initialize so cleanup can tell whether creation succeeded. */
APPLY_SPECIFIC(sptf) = NULL;

{
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
    if ((err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf))) != CUDNN_STATUS_SUCCESS)
    {
        PyErr_Format(PyExc_MemoryError,
            "GpuDnnTransformerGradT: could not allocate spatial transformer descriptor (sptf): %s",
            cudnnGetErrorString(err));
        FAIL;
    }
}

#section cleanup_code_struct

/* Destroy the descriptor only if it was created. */
if (APPLY_SPECIFIC(sptf) != NULL)
    cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf));
#section support_code_struct

/*
 * GpuDnnTransformerGradT: computes the gradient wrt the affine
 * transformation theta (*dtheta, shape (N, 2, 3)) from the gradient wrt
 * the sampling grid (dgrid, shape (N, H, W, 2)), using
 * cudnnSpatialTfGridGeneratorBackward.
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_gt)(PyGpuArrayObject * dgrid,
                            PyGpuArrayObject ** dtheta,
                            cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = dgrid->context;
    int num_images, height, width;
    int desc_dims[4];
    size_t dtheta_dims[3];
    cudnnDataType_t dt;
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    // Map the gpuarray typecode onto the cuDNN data type.
    switch(dgrid->ga.typecode)
    {
    case GA_DOUBLE:
        dt = CUDNN_DATA_DOUBLE;
        break;
    case GA_FLOAT:
        dt = CUDNN_DATA_FLOAT;
        break;
    case GA_HALF:
        dt = CUDNN_DATA_HALF;
        break;
    default:
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformerGradT: unsupported data type for dgrid in spatial transformer." );
        return 1;
    }

    num_images = (int) PyGpuArray_DIM( dgrid, 0 );
    height = (int) PyGpuArray_DIM( dgrid, 1 );
    width = (int) PyGpuArray_DIM( dgrid, 2 );

    // One 2x3 affine transformation matrix per image.
    dtheta_dims[0] = num_images;
    dtheta_dims[1] = 2;
    dtheta_dims[2] = 3;

    if ( theano_prep_output( dtheta, 3, dtheta_dims, dgrid->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
        return 1;

    desc_dims[0] = num_images;
    // Assume number of channels is 1, because the information is not
    // available or relevant here
    desc_dims[1] = 1;
    desc_dims[2] = height;
    desc_dims[3] = width;

    // Currently, only the bilinear sampler is supported by cuDNN,
    // so the sampler method is currently not available as a parameter
    err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                                 dt, 4, desc_dims );
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        // Fixed: the error message previously named the wrong Op
        // ("GpuDnnTransformerGrid").
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradT: could not initialize descriptor (sptf): %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    // Order the cuDNN call against pending accesses of dgrid and dtheta.
    cuda_enter( gpu_ctx->ctx );

    cuda_wait( dgrid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*dtheta)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfGridGeneratorBackward( _handle, APPLY_SPECIFIC(sptf),
        PyGpuArray_DEV_DATA( dgrid ), PyGpuArray_DEV_DATA( *dtheta ) );

    cuda_record( dgrid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*dtheta)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    cuda_exit( gpu_ctx->ctx );

    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerGradT: could not compute gradients of the affine transformation: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    return 0;
}
#section support_code_struct

/* Per-instance cuDNN descriptors for GpuDnnTransformerSampler (forward
   sampling pass of the spatial transformer). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(ydesc);

#section init_code_struct

/* NULL-initialize all handles so cleanup only destroys created ones. */
APPLY_SPECIFIC(sptf) = NULL;
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(ydesc) = NULL;

{
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf));
    if (err != CUDNN_STATUS_SUCCESS)
    {
        PyErr_Format(PyExc_MemoryError,
            "GpuDnnTransformerSampler: could not allocate spatial transformer descriptor (sptf): %s",
            cudnnGetErrorString( err ));
        FAIL;
    }

    /* Tensor descriptor for the input images. */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor xdesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }

    /* Tensor descriptor for the sampled output. */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(ydesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor ydesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }
}

#section cleanup_code_struct

/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
    cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf));
if ( APPLY_SPECIFIC(xdesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(ydesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(ydesc) );
#section support_code_struct

/*
 * GpuDnnTransformerSampler: forward sampling pass of the cuDNN spatial
 * transformer. Samples pixels from `input` at the coordinates given by
 * `grid` and writes the result into *output (shape N x C x Hg x Wg, where
 * Hg/Wg come from the grid).
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_sampler)(PyGpuArrayObject * input,
                                 PyGpuArrayObject * grid,
                                 PyGpuArrayObject ** output,
                                 cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = input->context;
    void * alpha_p;
    void * beta_p;
    double alpha = 1.0, beta = 0.0;
    float af = alpha, bf = beta;
    size_t out_dims[4];
    int desc_dims[4];
    cudnnDataType_t dt;
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    // Select the cuDNN data type and matching scaling factors: cuDNN
    // expects double alpha/beta for double data, float alpha/beta for
    // float and half data.
    switch (input->ga.typecode)
    {
    case GA_DOUBLE:
        alpha_p = (void *)&alpha;
        beta_p = (void *)&beta;
        dt = CUDNN_DATA_DOUBLE;
        break;
    case GA_FLOAT:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        dt = CUDNN_DATA_FLOAT;
        break;
    case GA_HALF:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        dt = CUDNN_DATA_HALF;
        break;
    default:
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformer: unsupported type for input in spatial transformer." );
        return 1;
    }

    // Output shape: batch and channels from the input, spatial extent
    // from the sampling grid.
    out_dims[0] = (size_t) PyGpuArray_DIM(input, 0); // num_images
    out_dims[1] = (size_t) PyGpuArray_DIM(input, 1); // num_channels
    out_dims[2] = (size_t) PyGpuArray_DIM(grid, 1); // grid height
    out_dims[3] = (size_t) PyGpuArray_DIM(grid, 2); // grid width

    // Set output dimensions for the descriptor setup
    desc_dims[0] = (int) out_dims[0];
    desc_dims[1] = (int) out_dims[1];
    desc_dims[2] = (int) out_dims[2];
    desc_dims[3] = (int) out_dims[3];

    // cuDNN does not support zero-length dimensions, so reject them
    // before any descriptor setup or allocation.
    if ( out_dims[0] == 0 || out_dims[1] == 0 || out_dims[2] == 0 || out_dims[3] == 0 )
    {
        PyErr_SetString( PyExc_RuntimeError,
            "GpuDnnTransformerSampler: one of the sampler dimensions is zero" );
        return 1;
    }

    if ( theano_prep_output( output, 4, out_dims, input->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
    {
        PyErr_SetString( PyExc_MemoryError,
            "GpuDnnTransformerSampler: could not allocate memory for grid sampler" );
        return 1;
    }

    // Currently, only the bilinear sampler is supported by cuDNN,
    // so the sampler method is currently not available as a parameter
    err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                                 dt, 4, desc_dims );
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerSampler: could not initialize descriptor: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
        return 1;

    if ( c_set_tensorNd( *output, APPLY_SPECIFIC(ydesc) ) != 0 )
        return 1;

    // Order the cuDNN call against pending reads/writes of its operands
    // (libgpuarray stream ordering).
    cuda_enter( gpu_ctx->ctx );

    cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfSamplerForward( _handle, APPLY_SPECIFIC(sptf), alpha_p,
        APPLY_SPECIFIC(xdesc), PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( grid ),
        beta_p, APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) );

    cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    cuda_exit( gpu_ctx->ctx );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerSampler: could not create grid sampler: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    return 0;
}
......@@ -13,6 +13,7 @@ from theano import Op, Apply, tensor, config, Variable
from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
int32 as int_t, bool as bool_t, uint32 as uint32_t)
from theano.tensor import as_tensor_variable, Argmax
from theano.tensor.extra_ops import cpu_contiguous
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler
......@@ -2782,6 +2783,235 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
return result
class GpuDnnTransformerGrid(DnnBase):
    """
    Grid generator Op for cuDNN Spatial Transformer.
    """
    __props__ = ()
    _cop_num_inputs = 2
    _cop_num_outputs = 1
    _f16_ok = True
    check_input = False

    def __init__(self):
        DnnBase.__init__(self, ["c_code/dnn_sptf_grid.c"],
                         "APPLY_SPECIFIC(dnn_sptf_grid)")

    def make_node(self, theta, out_dims):
        """
        Create a grid generator node for a cuDNN Spatial Transformer

        Parameters
        ----------
        theta : tensor
            Affine transformation tensor containing one affine transformation
            matrix per image. ``theta`` is usually generated by the localization
            network.
        out_dims : tuple
            Dimensions of the transformed inputs, containing four elements, and is given
            by (N, C, H, W), where N is the number of inputs, C the number of channels,
            H and W are the height and width of each input.
        """
        ctx_name = infer_context_name(theta)

        # theta must be a contiguous 3D float tensor on the GPU.
        theta = gpu_contiguous(as_gpuarray_variable(theta, ctx_name))
        assert theta.dtype in ('float16', 'float32', 'float64')
        assert theta.ndim == 3

        # out_dims is validated as a 1D integer vector on the host.
        out_dims = cpu_contiguous(as_tensor_variable(out_dims))
        assert out_dims.dtype in theano.tensor.basic.integer_dtypes
        assert out_dims.ndim == 1
        # Ensure 64-bit ints are passed to the C code
        out_dims = theano.tensor.basic.cast(out_dims, 'int64')

        # The grid has one more dimension than theta: (N, H, W, 2).
        grid = GpuArrayType(dtype=theta.dtype,
                            broadcastable=(theta.type.ndim + 1) * (False,),
                            context_name=ctx_name)()

        return Apply(self, [theta, out_dims], [grid])

    def grad(self, inputs, grads):
        theta, out_dims = inputs
        dgrid = grads[0]

        # out_dims is integer-valued; its gradient is not defined.
        dtheta = GpuDnnTransformerGradT()(dgrid)
        return [dtheta, grad_not_implemented(self, 1, out_dims)]
class GpuDnnTransformerSampler(DnnBase):
    """
    Grid sampler Op for cuDNN Spatial Transformer.
    """
    __props__ = ()
    _cop_num_inputs = 2
    _cop_num_outputs = 1
    _f16_ok = True
    check_input = False

    def __init__(self):
        DnnBase.__init__(self, ["c_code/dnn_sptf_sampler.c"],
                         "APPLY_SPECIFIC(dnn_sptf_sampler)")

    def make_node(self, img, grid):
        """
        Create a grid sampler node for a cuDNN Spatial Transformer

        Parameters
        ----------
        img : tensor
            Images from which the pixels will be sampled. The implementation
            assumes the tensor is in NCHW format, where N is the number of images,
            C is the number of color channels, H is the height of the inputs, and
            W is width of the inputs.
        grid : GpuDnnTransformerGrid
            Grid that contains the coordinates of the pixels to be sampled from
            the inputs images.
        """
        ctx_name = infer_context_name(img, grid)

        # Both inputs must be contiguous 4D floating-point GPU tensors.
        img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
        if img.type.ndim != 4:
            raise TypeError('img must be a 4D tensor')
        elif img.dtype not in ('float16', 'float32', 'float64'):
            raise TypeError('img type must be floating-point')

        grid = gpu_contiguous(as_gpuarray_variable(grid, ctx_name))
        if grid.type.ndim != 4:
            raise TypeError('grid must be a 4D tensor')
        elif grid.dtype not in ('float16', 'float32', 'float64'):
            raise TypeError('grid type must be floating-point')

        out = GpuArrayType(dtype=img.dtype,
                           broadcastable=img.type.ndim * (False,),
                           context_name=ctx_name)()

        return Apply(self, [img, grid], [out])

    def grad(self, inputs, grads):
        img, grid = inputs
        dy = grads[0]

        # A single cuDNN call produces the gradients wrt both inputs.
        dimg, dgrid = GpuDnnTransformerGradI()(img, grid, dy)
        return [dimg, dgrid]
class GpuDnnTransformerGradI(DnnBase):
    """
    Gradient of inputs Op for cuDNN Spatial Transformer.
    """
    __props__ = ()
    _cop_num_inputs = 3
    _cop_num_outputs = 2
    _f16_ok = True
    check_input = False

    def __init__(self):
        DnnBase.__init__(self, ["c_code/dnn_sptf_gi.c"],
                         "APPLY_SPECIFIC(dnn_sptf_gi)")

    def make_node(self, img, grid, dy):
        """
        Create a node computing the gradients wrt the sampler inputs.

        Parameters
        ----------
        img : tensor
            4D (NCHW) images used in the forward sampling pass.
        grid : tensor
            4D sampling grid used in the forward sampling pass.
        dy : tensor
            4D gradient of the sampler output.
        """
        context_name = infer_context_name(img, grid, dy)

        # Transfer to the GPU context first, then make contiguous, for
        # consistency with GpuDnnTransformerGrid and GpuDnnTransformerSampler
        # (the previous order applied gpu_contiguous before the variable was
        # converted to a GPU variable on the right context).
        img = gpu_contiguous(as_gpuarray_variable(img, context_name))
        if img.ndim != 4:
            raise TypeError('img must have 4 dimensions.')

        grid = gpu_contiguous(as_gpuarray_variable(grid, context_name))
        if img.ndim != grid.ndim:
            raise TypeError('grid should have the same number of dimensions as img')

        dy = as_gpuarray_variable(dy, context_name)
        if dy.ndim != 4:
            raise TypeError('dy must have 4 dimensions.')

        # Gradients have the same types/shapes as the corresponding inputs.
        dimg = img.type()
        dgrid = grid.type()

        return Apply(self, [img, grid, dy], [dimg, dgrid])
class GpuDnnTransformerGradT(DnnBase):
    """
    Gradient of affine transformations Op for cuDNN Spatial Transformer.
    """
    __props__ = ()
    _cop_num_inputs = 1
    _cop_num_outputs = 1
    _f16_ok = True
    check_input = False

    def __init__(self):
        DnnBase.__init__(self, ["c_code/dnn_sptf_gt.c"],
                         "APPLY_SPECIFIC(dnn_sptf_gt)")

    def make_node(self, dgrid):
        """
        Create a node computing the gradient wrt theta from the gradient
        wrt the sampling grid.
        """
        ctx_name = infer_context_name(dgrid)

        dgrid = as_gpuarray_variable(dgrid, ctx_name)
        assert dgrid.dtype in ('float16', 'float32', 'float64')
        assert dgrid.ndim == 4

        # dtheta is 3D: one 2x3 transformation matrix per image.
        dtheta = GpuArrayType(dtype=dgrid.dtype,
                              broadcastable=(dgrid.type.ndim - 1) * (False,),
                              context_name=ctx_name)()

        return Apply(self, [dgrid], [dtheta])
def dnn_spatialtf(img, theta, scale_width=1, scale_height=1):
    """
    GPU spatial transformer using cuDNN from NVIDIA.

    Parameters
    ----------
    img : tensor
        Images to which the transformations will be applied. The implementation
        assumes the tensor is in NCHW format, where N is the number of images,
        C is the number of color channels, H is the height of the inputs, and
        W is width of the inputs.
    theta : tensor
        Affine transformation tensor containing one affine transformation
        matrix per image. ``theta`` is usually generated by the localization
        network.
    scale_height: float
        A float specifying the scaling factor for the height of the output
        image. A value of 1 will keep the original height of the input. Values
        larger than 1 will upsample the input. Values below 1 will downsample
        the input.
    scale_width: float
        A float specifying the scaling factor for the width of the output
        image. A value of 1 will keep the original width of the input. Values
        larger than 1 will upsample the input. Values below 1 will downsample
        the input.

    Returns
    -------
    out : tensor
        Transformed images with width and height properly scaled.

    Notes
    -----
    Currently, cuDNN only supports 2D transformations with 2x3 affine
    transformation matrices.

    Bilinear interpolation is the only grid sampler method available.
    """
    # Output spatial extent: input extent scaled and rounded up.
    scaled_height = theano.tensor.ceil(img.shape[2] * scale_height)
    scaled_width = theano.tensor.ceil(img.shape[3] * scale_width)
    out_dims = tuple(as_scalar(dim).astype('int64')
                     for dim in (img.shape[0], img.shape[1],
                                 scaled_height, scaled_width))

    # Setup spatial transformer: generate the sampling grid from theta,
    # then sample the input images at the grid coordinates.
    grid = GpuDnnTransformerGrid()(theta, out_dims)
    return GpuDnnTransformerSampler()(img, grid)
def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights,
......@@ -3137,6 +3367,7 @@ def local_dnn_convgw_inplace(node, inputs):
def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
......
......@@ -2296,3 +2296,226 @@ class Cudnn_grouped_conv(Grouped_conv_noOptim):
conv_gradi_op = dnn.GpuDnnConvGradI
flip_filter = False
is_dnn = True
def test_dnn_spatialtf():
    # Skip if cuDNN (with spatial transformer support) is not available.
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    utt.seed_rng()

    """
    Spatial Transformer implementation using Theano from Lasagne
    Original author: skaae (https://github.com/skaae)
    """

    def spatialtf_cpu(inp, theta, scale_height, scale_width, border_mode='nearest'):
        # CPU reference implementation of the spatial transformer,
        # used to validate the cuDNN output below.
        num_batch, num_channels, height, width = inp.shape
        theta = T.reshape(theta, (-1, 2, 3))

        # grid of (x_t, y_t, 1), eq (1) in ref [1]
        out_height = T.cast(T.ceil(height * scale_height), 'int64')
        out_width = T.cast(T.ceil(width * scale_width), 'int64')
        grid = _meshgrid(out_height, out_width)

        # transform a x (x_t, y_t, 1)^t -> (x_s, y_s)
        t_g = T.dot(theta, grid)
        x_s = t_g[:, 0]
        y_s = t_g[:, 1]
        x_s_flat = x_s.flatten()
        y_s_flat = y_s.flatten()

        # dimshuffle input to (bs, height, width, channels)
        input_dim = inp.dimshuffle(0, 2, 3, 1)
        input_transformed = _interpolate(
            input_dim, x_s_flat, y_s_flat,
            out_height, out_width, border_mode)

        output = T.reshape(
            input_transformed, (num_batch, out_height, out_width, num_channels))
        output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
        return output

    def _interpolate(im, x, y, out_height, out_width, border_mode):
        # Bilinear interpolation of `im` at the (x, y) sample coordinates.
        # *_f are floats
        num_batch, height, width, channels = im.shape
        height_f = T.cast(height, theano.config.floatX)
        width_f = T.cast(width, theano.config.floatX)

        # scale coordinates from [-1, 1] to [0, dimension - 1], where dimension
        # can be the width or height
        x = (x + 1) / 2 * (width_f - 1)
        y = (y + 1) / 2 * (height_f - 1)

        # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
        # we need those in floatX for interpolation and in int64 for indexing.
        x0_f = T.floor(x)
        y0_f = T.floor(y)
        x1_f = x0_f + 1
        y1_f = y0_f + 1

        # for indexing, we need to take care of the border mode for outside pixels.
        if border_mode == 'nearest':
            x0 = T.clip(x0_f, 0, width_f - 1)
            x1 = T.clip(x1_f, 0, width_f - 1)
            y0 = T.clip(y0_f, 0, height_f - 1)
            y1 = T.clip(y1_f, 0, height_f - 1)
        elif border_mode == 'mirror':
            w = 2 * (width_f - 1)
            x0 = T.minimum(x0_f % w, -x0_f % w)
            x1 = T.minimum(x1_f % w, -x1_f % w)
            h = 2 * (height_f - 1)
            y0 = T.minimum(y0_f % h, -y0_f % h)
            y1 = T.minimum(y1_f % h, -y1_f % h)
        elif border_mode == 'wrap':
            x0 = T.mod(x0_f, width_f)
            x1 = T.mod(x1_f, width_f)
            y0 = T.mod(y0_f, height_f)
            y1 = T.mod(y1_f, height_f)
        else:
            raise ValueError("border_mode must be one of "
                             "'nearest', 'mirror', 'wrap'")
        x0, x1, y0, y1 = (T.cast(v, 'int64') for v in (x0, x1, y0, y1))

        # The input is [num_batch, height, width, channels]. We do the lookup in
        # the flattened input, i.e [num_batch*height*width, channels]. We need
        # to offset all indices to match the flat version
        dim2 = width
        dim1 = width * height
        base = T.repeat(
            T.arange(num_batch, dtype='int64') * dim1, out_height * out_width)
        base_y0 = base + y0 * dim2
        base_y1 = base + y1 * dim2
        idx_a = base_y0 + x0
        idx_b = base_y1 + x0
        idx_c = base_y0 + x1
        idx_d = base_y1 + x1

        # use indices to lookup pixels for all samples
        im_flat = im.reshape((-1, channels))
        Ia = im_flat[idx_a]
        Ib = im_flat[idx_b]
        Ic = im_flat[idx_c]
        Id = im_flat[idx_d]

        # calculate interpolated values
        wa = ((x1_f - x) * (y1_f - y)).dimshuffle(0, 'x')
        wb = ((x1_f - x) * (y - y0_f)).dimshuffle(0, 'x')
        wc = ((x - x0_f) * (y1_f - y)).dimshuffle(0, 'x')
        wd = ((x - x0_f) * (y - y0_f)).dimshuffle(0, 'x')
        output = T.sum([wa * Ia, wb * Ib, wc * Ic, wd * Id], axis=0)
        return output

    def _linspace(start, stop, num):
        # Theano linspace. Behaves similar to np.linspace
        start = T.cast(start, theano.config.floatX)
        stop = T.cast(stop, theano.config.floatX)
        num = T.cast(num, theano.config.floatX)
        step = (stop - start) / (num - 1)
        return T.arange(num, dtype=theano.config.floatX) * step + start

    def _meshgrid(height, width):
        # This function is the grid generator from eq. (1) in reference [1].
        # It is equivalent to the following numpy code:
        #  x_t, y_t = np.meshgrid(np.linspace(-1, 1, width),
        #                         np.linspace(-1, 1, height))
        #  ones = np.ones(np.prod(x_t.shape))
        #  grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
        # It is implemented in Theano instead to support symbolic grid sizes.
        # Note: If the image size is known at layer construction time, we could
        # compute the meshgrid offline in numpy instead of doing it dynamically
        # in Theano. However, it hardly affected performance when we tried.
        x_t = T.dot(T.ones((height, 1)),
                    _linspace(-1.0, 1.0, width).dimshuffle('x', 0))
        y_t = T.dot(_linspace(-1.0, 1.0, height).dimshuffle(0, 'x'),
                    T.ones((1, width)))
        x_t_flat = x_t.reshape((1, -1))
        y_t_flat = y_t.reshape((1, -1))
        ones = T.ones_like(x_t_flat)
        grid = T.concatenate([x_t_flat, y_t_flat, ones], axis=0)
        return grid

    img_dims = (5, 3, 16, 16)
    img = np.random.random(size=img_dims).astype(theano.config.floatX)
    # Downscale height, upscale-ish width, to exercise non-trivial scaling.
    scale_height = 0.25
    scale_width = 0.75

    # Transformation matrix
    transform = [[-1, 0, 0],
                 [0, -1, 0]]
    theta = np.asarray(img_dims[0] * [transform], dtype=theano.config.floatX)

    # Create symbolic variables for inputs and transformations
    t_img = T.tensor4('img')
    t_theta = T.tensor3('theta')

    st_dnn = dnn.dnn_spatialtf(t_img, t_theta, scale_height=scale_height, scale_width=scale_width)
    st_dnn_func = theano.function([t_img, t_theta], st_dnn)

    # Check if function graph contains the spatial transformer's grid and sampler Ops
    apply_nodes = st_dnn_func.maker.fgraph.apply_nodes
    assert any([isinstance(node.op, dnn.GpuDnnTransformerGrid) for node in apply_nodes])
    assert any([isinstance(node.op, dnn.GpuDnnTransformerSampler) for node in apply_nodes])

    img_out_gpu = st_dnn_func(img, theta)
    img_out_gpu = np.asarray(img_out_gpu)

    # Setup CPU Op
    st_cpu = spatialtf_cpu(t_img, t_theta, scale_height, scale_width, 'nearest')
    st_cpu_func = theano.function([t_img, t_theta], st_cpu, mode=mode_without_gpu)
    img_out_cpu = st_cpu_func(img, theta)

    atol, rtol = None, None
    if theano.config.floatX == 'float16':
        # Raise relative error tolerance when using float16
        rtol = 5e-2

    utt.assert_allclose(img_out_cpu, img_out_gpu, atol=atol, rtol=rtol)
def test_dnn_spatialtf_grad():
    # Tests that the spatial transformer gradients are computed through the
    # dedicated cuDNN gradient Ops and validates them numerically.
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    utt.seed_rng()

    inputs = T.tensor4('inputs')
    theta = T.tensor3('theta')

    out = dnn.dnn_spatialtf(inputs, theta, scale_height=0.25, scale_width=0.75)
    out_mean = T.mean(out)
    mean_gi = T.grad(out_mean, [inputs])
    mean_gt = T.grad(out_mean, [theta])

    f_gi = theano.function([inputs, theta], mean_gi)
    assert any([isinstance(node.op, dnn.GpuDnnTransformerGradI)
                for node in f_gi.maker.fgraph.apply_nodes])

    f_gt = theano.function([inputs, theta], mean_gt)
    assert any([isinstance(node.op, dnn.GpuDnnTransformerGradT)
                for node in f_gt.maker.fgraph.apply_nodes])

    input_dims = (5, 3, 16, 16)
    inputs_val = np.random.random(size=input_dims).astype(theano.config.floatX)
    # Tensor with transformations
    theta_val = np.random.random((input_dims[0], 2, 3)).astype(theano.config.floatX)
    # Using smaller values for theta, increases the precision of gradients
    # when using lower precision. Tests might fail for lower precision data
    # types if the values of theta or the inputs are very high.
    # Fixed: this must scale the numeric values (theta_val), not the symbolic
    # variable `theta` -- rebinding the symbolic variable had no effect on the
    # values fed to the compiled functions below.
    theta_val /= 100

    # Check that the gradients are computed
    f_gi(inputs_val, theta_val)
    f_gt(inputs_val, theta_val)

    def grad_functor(inputs, theta):
        out = dnn.dnn_spatialtf(inputs, theta)
        return out

    atol, rtol = None, None
    if theano.config.floatX == 'float32':
        rtol = 5e-2
    elif theano.config.floatX == 'float16':
        rtol = 1e-0

    utt.verify_grad(grad_functor, [inputs_val, theta_val], mode=mode_with_gpu,
                    abs_tol=atol, rel_tol=rtol)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论