Merge spatial transformer implementation into a single Op, GpuDnnTransformer

Parent commit: 09e362f2
#section support_code #section support_code
typedef struct __spatialtf_context { typedef struct __spatialtf_context {
PyGpuArrayObject * grid;
cudnnTensorDescriptor_t xdesc; cudnnTensorDescriptor_t xdesc;
cudnnTensorDescriptor_t ydesc; cudnnTensorDescriptor_t ydesc;
} spatialtf_context_t; } spatialtf_context_t;
void spatialtf_context_init( spatialtf_context_t * ctx ) void spatialtf_context_init( spatialtf_context_t * ctx )
{ {
if ( ctx == NULL )
return;
ctx->grid = NULL;
ctx->xdesc = NULL; ctx->xdesc = NULL;
ctx->ydesc = NULL; ctx->ydesc = NULL;
} }
void spatialtf_context_destroy( spatialtf_context_t * ctx ) void spatialtf_context_destroy( spatialtf_context_t * ctx )
{ {
Py_XDECREF( ctx->grid );
if ( NULL != ctx->xdesc ) if ( NULL != ctx->xdesc )
cudnnDestroyTensorDescriptor( ctx->xdesc ); cudnnDestroyTensorDescriptor( ctx->xdesc );
...@@ -23,8 +30,9 @@ void spatialtf_context_destroy( spatialtf_context_t * ctx ) ...@@ -23,8 +30,9 @@ void spatialtf_context_destroy( spatialtf_context_t * ctx )
#section support_code_struct #section support_code_struct
int int
spatialtf_sampler(PyGpuArrayObject * input, dnn_sptf(PyGpuArrayObject * input,
PyGpuArrayObject * grid, PyGpuArrayObject * theta,
PyArrayObject * grid_dims,
cudnnSpatialTransformerDescriptor_t desc, cudnnSpatialTransformerDescriptor_t desc,
double alpha, double beta, double alpha, double beta,
PyGpuArrayObject ** output, PyGpuArrayObject ** output,
...@@ -39,18 +47,6 @@ spatialtf_sampler(PyGpuArrayObject * input, ...@@ -39,18 +47,6 @@ spatialtf_sampler(PyGpuArrayObject * input,
cudnnTensorFormat_t tf = CUDNN_TENSOR_NCHW; cudnnTensorFormat_t tf = CUDNN_TENSOR_NCHW;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if ( PyGpuArray_NDIM( grid ) != 4 )
{
PyErr_SetString( PyExc_RuntimeError,
"grid_dimensions must have 4 dimensions" );
return -1;
}
// Obtain grid dimensions
const int num_images = (int) PyGpuArray_DIM( grid, 0 );
const int height = (int) PyGpuArray_DIM( grid, 1 );
const int width = (int) PyGpuArray_DIM( grid, 2 );
switch (input->ga.typecode) switch (input->ga.typecode)
{ {
case GA_DOUBLE: case GA_DOUBLE:
...@@ -70,7 +66,63 @@ spatialtf_sampler(PyGpuArrayObject * input, ...@@ -70,7 +66,63 @@ spatialtf_sampler(PyGpuArrayObject * input,
break; break;
default: default:
PyErr_SetString( PyExc_TypeError, PyErr_SetString( PyExc_TypeError,
"Unsupported type in spatial transformer sampler" ); "GpuDnnTransformer: unsupported type in spatial transformer sampler" );
return -1;
}
if ( ! GpuArray_IS_C_CONTIGUOUS( &(input->ga) ) )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: input data is not C-contiguous" );
return -1;
}
if ( theta->ga.typecode != GA_FLOAT &&
theta->ga.typecode != GA_DOUBLE &&
theta->ga.typecode != GA_HALF )
{
PyErr_SetString( PyExc_TypeError, "GpuDnnTransformer: unsupported data type for theta" );
return -1;
}
else if ( PyGpuArray_NDIM( theta ) != 3 )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: theta must have three dimensions!" );
return -1;
}
else if ( PyGpuArray_DIM( theta, 1 ) != 2 && PyGpuArray_DIM( theta, 2 ) != 3 )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
PyGpuArray_DIMS( theta )[0], 2, 3, PyGpuArray_DIMS( theta )[0],
PyGpuArray_DIMS( theta )[1], PyGpuArray_DIMS( theta )[2] );
return -1;
}
else if ( ! GpuArray_IS_C_CONTIGUOUS( &(theta->ga) ) )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: theta is not C-contiguous" );
return -1;
}
if ( PyArray_NDIM( grid_dims ) != 1 || PyArray_SIZE( grid_dims ) != 4 )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: grid_dims must have 4 elements." );
return -1;
}
// Obtain grid dimensions
const int num_images = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 0 ) );
const int num_channels = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 1 ) );
const int height = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 2 ) );
const int width = (int) *( (npy_int *) PyArray_GETPTR1( grid_dims, 3 ) );
const size_t gpu_grid_dims[4] = { num_images, height, width, 2 };
if ( width == 0 || height == 0 || num_images == 0 )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: grid_dims has a dimension with value zero" );
return -1; return -1;
} }
...@@ -78,6 +130,16 @@ spatialtf_sampler(PyGpuArrayObject * input, ...@@ -78,6 +130,16 @@ spatialtf_sampler(PyGpuArrayObject * input,
cuda_enter( gpu_ctx->ctx ); cuda_enter( gpu_ctx->ctx );
spatialtf_ctx.grid = pygpu_empty(4, &(gpu_grid_dims[0]), input->ga.typecode, GA_C_ORDER,
gpu_ctx, Py_None);
if ( spatialtf_ctx.grid == NULL )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: could not allocate memory for grid of coordinates" );
return -1;
}
err = cudnnCreateTensorDescriptor( &(spatialtf_ctx.xdesc) ); err = cudnnCreateTensorDescriptor( &(spatialtf_ctx.xdesc) );
if ( err != CUDNN_STATUS_SUCCESS ) if ( err != CUDNN_STATUS_SUCCESS )
...@@ -86,7 +148,7 @@ spatialtf_sampler(PyGpuArrayObject * input, ...@@ -86,7 +148,7 @@ spatialtf_sampler(PyGpuArrayObject * input,
cuda_exit( gpu_ctx->ctx ); cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"Could not create xdesc: %s", "GpuDnnTransformer: could not create xdesc: %s",
cudnnGetErrorString(err) ); cudnnGetErrorString(err) );
return -1; return -1;
} }
...@@ -99,10 +161,10 @@ spatialtf_sampler(PyGpuArrayObject * input, ...@@ -99,10 +161,10 @@ spatialtf_sampler(PyGpuArrayObject * input,
const int input_height = (int) PyGpuArray_DIM( input, 2 ); const int input_height = (int) PyGpuArray_DIM( input, 2 );
const int input_width = (int) PyGpuArray_DIM( input, 3 ); const int input_width = (int) PyGpuArray_DIM( input, 3 );
if ( input_num_images != num_images ) if ( input_num_images != num_images || input_num_channels != num_channels )
{ {
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"Input should have %d images, got %d images.", "GpuDnnTransformer: expected input to have %d inputs, got %d inputs.",
num_images, input_num_images ); num_images, input_num_images );
return -1; return -1;
} }
...@@ -116,7 +178,7 @@ spatialtf_sampler(PyGpuArrayObject * input, ...@@ -116,7 +178,7 @@ spatialtf_sampler(PyGpuArrayObject * input,
cuda_exit( gpu_ctx->ctx ); cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"Could not initialize xdesc: %s", "GpuDnnTransformer: failed to initialize xdesc: %s",
cudnnGetErrorString(err) ); cudnnGetErrorString(err) );
return -1; return -1;
} }
...@@ -129,7 +191,7 @@ spatialtf_sampler(PyGpuArrayObject * input, ...@@ -129,7 +191,7 @@ spatialtf_sampler(PyGpuArrayObject * input,
cuda_exit( gpu_ctx->ctx ); cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"Could not create ydesc: %s", "GpuDnnTransformer: failed to create ydesc: %s",
cudnnGetErrorString(err) ); cudnnGetErrorString(err) );
return -1; return -1;
} }
...@@ -143,70 +205,51 @@ spatialtf_sampler(PyGpuArrayObject * input, ...@@ -143,70 +205,51 @@ spatialtf_sampler(PyGpuArrayObject * input,
cuda_exit( gpu_ctx->ctx ); cuda_exit( gpu_ctx->ctx );
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"Could not initialize ydesc: %s", "GpuDnnTransformer: failed to initialize ydesc: %s",
cudnnGetErrorString(err) ); cudnnGetErrorString(err) );
return -1; return -1;
} }
const size_t out_dims[4] = { num_images, input_num_channels, height, width }; const size_t out_dims[4] = { num_images, input_num_channels, height, width };
if ( NULL == *output || if ( theano_prep_output( output, 4, out_dims, input->ga.typecode,
! theano_size_check( *output, 4, out_dims, (*output)->ga.typecode ) ) GA_C_ORDER, gpu_ctx ) != 0 )
{
Py_XDECREF( *output );
*output = pygpu_empty( 4, out_dims, input->ga.typecode, GA_C_ORDER,
gpu_ctx, Py_None );
if ( NULL == *output )
{ {
spatialtf_context_destroy( &spatialtf_ctx ); spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx ); cuda_exit( gpu_ctx->ctx );
PyErr_SetString( PyExc_MemoryError, PyErr_SetString( PyExc_MemoryError,
"Could allocate memory for spatial transformer's grid sampler" ); "GpuDnnTransformer: could not allocate memory for grid sampler" );
return -1; return -1;
} }
}
if ( ! GpuArray_IS_C_CONTIGUOUS( &(input->ga) ) ) cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
{ cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
PyErr_SetString( PyExc_MemoryError, cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
"input data is not C-contiguous" );
return -1;
}
if ( ! GpuArray_IS_C_CONTIGUOUS( &(grid->ga) ) ) err = cudnnSpatialTfGridGeneratorForward( _handle, desc, PyGpuArray_DEV_DATA( theta ),
{ PyGpuArray_DEV_DATA( spatialtf_ctx.grid ) );
PyErr_SetString( PyExc_MemoryError,
"grid data is not C-contiguous" );
return -1;
}
if ( ! GpuArray_IS_C_CONTIGUOUS( &((*output)->ga) ) ) if ( CUDNN_STATUS_SUCCESS != err )
{ {
PyErr_SetString( PyExc_MemoryError, PyErr_Format( PyExc_RuntimeError,
"theta data is not C-contiguous" ); "GpuDnnTransformer: failed to create grid of coordinates: %s",
cudnnGetErrorString( err ) );
return -1; return -1;
} }
cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
const void * input_data = PyGpuArray_DEV_DATA( input );
const void * grid_data = PyGpuArray_DEV_DATA( grid );
void * out_data = PyGpuArray_DEV_DATA( *output );
err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, spatialtf_ctx.xdesc, err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, spatialtf_ctx.xdesc,
input_data, grid_data, beta_p, spatialtf_ctx.ydesc, out_data ); PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( spatialtf_ctx.grid ),
beta_p, spatialtf_ctx.ydesc, PyGpuArray_DEV_DATA( *output ) );
cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
if ( CUDNN_STATUS_SUCCESS != err ) if ( CUDNN_STATUS_SUCCESS != err )
{ {
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: failed to create grid sampler" );
spatialtf_context_destroy( &spatialtf_ctx ); spatialtf_context_destroy( &spatialtf_ctx );
cuda_exit( gpu_ctx->ctx ); cuda_exit( gpu_ctx->ctx );
return -1; return -1;
......
#section support_code

/**
 * Generate the grid of sampling coordinates for a cuDNN 2D spatial
 * transformer from the affine transformation parameters `theta`.
 *
 * grid_dimensions : 1-D numpy array of 4 npy_int values
 *                   (num_images, num_channels, height, width).
 * theta           : C-contiguous GPU array of shape (num_images, 2, 3)
 *                   holding one affine transform per image.
 * desc            : previously initialized spatial transformer descriptor.
 * grid            : in/out; (re)allocated here as a C-contiguous GPU array
 *                   of shape (num_images, height, width, 2).  The caller
 *                   owns the reference.
 * _handle         : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int
spatialtf_grid(PyArrayObject * grid_dimensions,
               PyGpuArrayObject * theta,
               cudnnSpatialTransformerDescriptor_t desc,
               PyGpuArrayObject ** grid,
               cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = theta->context;
    cudnnStatus_t err;

    if ( theta->ga.typecode != GA_FLOAT &&
         theta->ga.typecode != GA_DOUBLE &&
         theta->ga.typecode != GA_HALF )
    {
        PyErr_SetString( PyExc_TypeError, "Unsupported data type for theta" );
        return -1;
    }

    if ( PyGpuArray_NDIM( theta ) != 3 )
    {
        PyErr_SetString( PyExc_RuntimeError,
                         "theta must have three dimensions!" );
        return -1;
    }

    // BUG FIX: this test used &&, so theta was rejected only when *both*
    // trailing dimensions were wrong; e.g. a (n, 2, 5) tensor slipped
    // through.  Either mismatch must be an error.
    // Also cast the size_t dims for the varargs call: passing size_t
    // through "%d" is a format/argument type mismatch.
    if ( PyGpuArray_DIM( theta, 1 ) != 2 || PyGpuArray_DIM( theta, 2 ) != 3 )
    {
        PyErr_Format( PyExc_RuntimeError,
                      "Incorrect dimensions for theta, should be (%ld, %ld, %ld), got (%ld, %ld, %ld)",
                      (long) PyGpuArray_DIMS( theta )[0], 2L, 3L,
                      (long) PyGpuArray_DIMS( theta )[0],
                      (long) PyGpuArray_DIMS( theta )[1],
                      (long) PyGpuArray_DIMS( theta )[2] );
        return -1;
    }

    // Check contiguity before touching *grid so the error path does not
    // perform a useless allocation.
    if ( ! GpuArray_IS_C_CONTIGUOUS( &(theta->ga) ) )
    {
        PyErr_SetString( PyExc_MemoryError,
                         "theta data is not C-contiguous" );
        return -1;
    }

    // grid_dimensions must be a vector of exactly 4 elements.  Checking the
    // rank first keeps PyArray_DIM( ..., 0 ) well-defined, and the message
    // now says "elements" (the old one said "dimensions", which is what the
    // rank check is about, not the length).
    if ( PyArray_NDIM( grid_dimensions ) != 1 ||
         PyArray_DIM( grid_dimensions, 0 ) != 4 )
    {
        PyErr_SetString( PyExc_RuntimeError,
                         "grid_dimensions must be a vector of 4 elements!" );
        return -1;
    }

    // Obtain grid dimensions
    const size_t num_images = (size_t) *( (npy_int *) PyArray_GETPTR1( grid_dimensions, 0 ) );
    // Element 1 is the number of image channels; the grid does not need it.
    const size_t height = (size_t) *( (npy_int *) PyArray_GETPTR1( grid_dimensions, 2 ) );
    const size_t width = (size_t) *( (npy_int *) PyArray_GETPTR1( grid_dimensions, 3 ) );
    // Grid of coordinates is of size num_images * height * width * 2
    // for a 2D transformation
    const size_t grid_dims[4] = { num_images, height, width, 2 };

    if ( width == 0 || height == 0 || num_images == 0 )
    {
        PyErr_SetString( PyExc_RuntimeError,
                         "One of the grid dimensions is zero" );
        return -1;
    }

    // Reuse *grid only when both its shape and its dtype match what we are
    // about to generate.  BUG FIX: the size check previously compared
    // against (*grid)->ga.typecode, so a stale grid with the wrong dtype
    // was silently reused.
    if ( NULL == *grid ||
         ! theano_size_check( *grid, 4, grid_dims, theta->ga.typecode ) )
    {
        Py_XDECREF( *grid );

        *grid = pygpu_empty( 4, grid_dims, theta->ga.typecode, GA_C_ORDER,
                             gpu_ctx, Py_None );
        if ( NULL == *grid )
        {
            PyErr_SetString( PyExc_MemoryError,
                             "Could not allocate memory for grid of coordinates" );
            return -1;
        }
    }

    // A freshly pygpu_empty()'d grid is C-contiguous; this guards only the
    // reuse path above.
    if ( ! GpuArray_IS_C_CONTIGUOUS( &((*grid)->ga) ) )
    {
        PyErr_SetString( PyExc_MemoryError,
                         "grid data is not C-contiguous" );
        return -1;
    }

    cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfGridGeneratorForward( _handle, desc,
                                              PyGpuArray_DEV_DATA( theta ),
                                              PyGpuArray_DEV_DATA( *grid ) );

    cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
                      "Failed to create grid of coordinates: %s",
                      cudnnGetErrorString( err ) );
        return -1;
    }

    return 0;
}
...@@ -2833,7 +2833,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -2833,7 +2833,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
return [rval] return [rval]
class GpuDnnSpatialTfDesc(COp): class _GpuDnnTransformerDescriptor(COp):
""" """
This Op builds a spatial transformer descriptor for use in spatial transformer network This Op builds a spatial transformer descriptor for use in spatial transformer network
...@@ -2859,13 +2859,13 @@ class GpuDnnSpatialTfDesc(COp): ...@@ -2859,13 +2859,13 @@ class GpuDnnSpatialTfDesc(COp):
return False return False
def __init__(self, dtype=theano.config.floatX): def __init__(self, dtype=theano.config.floatX):
COp.__init__(self, ["c_code/spatialtf_desc.c"], "APPLY_SPECIFIC(spatialtf_desc)") COp.__init__(self, ["c_code/dnn_sptf_desc.c"], "APPLY_SPECIFIC(dnn_sptf_desc)")
assert cudnn.cudnnDataType_t.has_alias(dtype) assert cudnn.cudnnDataType_t.has_alias(dtype)
self.dtype = dtype self.dtype = dtype
def make_node(self, dimensions): def make_node(self, dimensions):
# cuDNN supports only 2D transformations, therefore output tensor must # cuDNN supports only 2D transformations, and the output tensor must
# have exactly 4 dimensions: (num_images, num_channels, height, width) # have exactly 4 dimensions: (num_images, num_channels, height, width)
assert len(dimensions) == 4 assert len(dimensions) == 4
dimensions = tuple(dimensions) dimensions = tuple(dimensions)
...@@ -2883,63 +2883,31 @@ class GpuDnnSpatialTfDesc(COp): ...@@ -2883,63 +2883,31 @@ class GpuDnnSpatialTfDesc(COp):
return node return node
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(GpuDnnSpatialTfDesc, self).c_code_cache_version(), version()) return (super(_GpuDnnTransformerDescriptor, self).c_code_cache_version(), version())
class GpuDnnGridGenerator(DnnBase): class GpuDnnTransformer(DnnBase):
""" """
This Op builds a spatial transformer grid generator for use in spatial transformer network This Op builds a spatial transformer that can be used in spatial transformer networks.
operations.
""" """
__props__ = ('dtype',) __props__ = ('dtype',)
_cop_num_inputs = 3 _cop_num_inputs = 6
_cop_num_outputs = 1 _cop_num_outputs = 1
_f16_ok = True
def __init__(self, dtype): def __init__(self, dtype):
DnnBase.__init__(self, ["c_code/spatialtf_grid.c"], "spatialtf_grid") DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "dnn_sptf")
self.dtype = dtype self.dtype = dtype
def make_node(self, grid_dimensions, theta, desc): def make_node(self, img, theta, grid_dims, desc, alpha=None, beta=None):
context_name = infer_context_name(desc, theta)
grid_dimensions = as_tensor_variable(grid_dimensions)
theta = gpu_contiguous(as_gpuarray_variable(theta, context_name))
assert theta.dtype in ('float16', 'float32', 'float64') assert theta.dtype in ('float16', 'float32', 'float64')
# Allocate GPU memory for grid of coordinates context_name = infer_context_name(img)
grid = GpuArrayType(dtype=self.dtype,
broadcastable=(False, False, False, False,),
context_name=context_name)()
return Apply(self, [grid_dimensions, theta, desc], [grid])
def L_op(self, inputs, outputs, output_grads):
pass
class GpuDnnGridSampler(DnnBase):
"""
This Op builds a spatial transformer grid sampler for use in spatial transformer network
operations.
"""
__props__ = ('dtype',)
_cop_num_inputs = 5
_cop_num_outputs = 1
def __init__(self, dtype):
DnnBase.__init__(self, ["c_code/spatialtf_sampler.c"], "spatialtf_sampler")
self.dtype = dtype
def make_node(self, img, grid, desc, alpha=None, beta=None):
context_name = infer_context_name(img, grid)
theta = gpu_contiguous(as_gpuarray_variable(theta, context_name))
img = as_gpuarray_variable(img, context_name) img = as_gpuarray_variable(img, context_name)
grid = as_gpuarray_variable(grid, context_name) grid_dims = as_tensor_variable(grid_dims)
output = GpuArrayType(dtype=self.dtype, output = GpuArrayType(dtype=self.dtype,
broadcastable=img.type.ndim * (False,), broadcastable=img.type.ndim * (False,),
...@@ -2955,9 +2923,9 @@ class GpuDnnGridSampler(DnnBase): ...@@ -2955,9 +2923,9 @@ class GpuDnnGridSampler(DnnBase):
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
beta = ensure_dt(beta, _zero, 'beta', img.dtype) beta = ensure_dt(beta, _zero, 'beta', img.dtype)
return Apply(self, [img, grid, desc, alpha, beta], [output]) return Apply(self, [img, theta, grid_dims, desc, alpha, beta], [output])
def L_op(self, inputs, outputs, output_grads): def L_op(self, inputs, outputs, grads):
pass pass
...@@ -3011,13 +2979,12 @@ def dnn_spatialtf(inp, theta, scale_width=1, scale_height=1, alpha=None, beta=No ...@@ -3011,13 +2979,12 @@ def dnn_spatialtf(inp, theta, scale_width=1, scale_height=1, alpha=None, beta=No
theta = gpu_contiguous(theta) theta = gpu_contiguous(theta)
# Create spatial transformer descriptor # Create spatial transformer descriptor
desc = GpuDnnSpatialTfDesc(dtype)(grid_dims) desc = _GpuDnnTransformerDescriptor(dtype)(grid_dims)
# Create grid dimensions variable # Create grid dimensions variable
grid_dims_var = as_tensor_variable(grid_dims) grid_dims_var = as_tensor_variable(grid_dims)
# Setup and return sampling grid # Setup spatial transformer
grid_coord = GpuDnnGridGenerator(dtype)(grid_dims_var, theta, desc) transformer = GpuDnnTransformer(dtype)(inp, theta, grid_dims_var, desc, alpha, beta)
grid_sampler = GpuDnnGridSampler(dtype)(inp, grid_coord, desc, alpha, beta) return transformer
return grid_sampler
@local_optimizer([AbstractConv2d, AbstractConv3d]) @local_optimizer([AbstractConv2d, AbstractConv3d])
......
...@@ -2440,7 +2440,6 @@ def test_dnn_spatialtf(): ...@@ -2440,7 +2440,6 @@ def test_dnn_spatialtf():
img = np.random.randint(low=0, high=256, size=img_dims) img = np.random.randint(low=0, high=256, size=img_dims)
# Convert from NHWC to NCHW # Convert from NHWC to NCHW
img = np.transpose(img, axes=(0, 3, 1, 2)).astype(theano.config.floatX) img = np.transpose(img, axes=(0, 3, 1, 2)).astype(theano.config.floatX)
gpu_img = gpuarray_shared_constructor(img)
# Downsample image dimensions by a factor of 2, i.e. our output tensor will # Downsample image dimensions by a factor of 2, i.e. our output tensor will
# have shape (n, c, h / 2, w / 2) # have shape (n, c, h / 2, w / 2)
scale_height = 0.25 scale_height = 0.25
...@@ -2451,25 +2450,25 @@ def test_dnn_spatialtf(): ...@@ -2451,25 +2450,25 @@ def test_dnn_spatialtf():
[0, -1, 0]] [0, -1, 0]]
transform = np.asarray(img_dims[0] * [theta], dtype=theano.config.floatX) transform = np.asarray(img_dims[0] * [theta], dtype=theano.config.floatX)
gpu_transform = gpuarray_shared_constructor(transform)
st_dnn = dnn.dnn_spatialtf(gpu_img, gpu_transform, scale_height=scale_height, # Create symbolic variables for inputs and transformations
t_img = T.tensor4('img')
t_theta = T.tensor3('theta')
st_dnn = dnn.dnn_spatialtf(t_img, t_theta, scale_height=scale_height,
scale_width=scale_width) scale_width=scale_width)
st_dnn_func = theano.function([], [st_dnn]) st_dnn_func = theano.function([t_img, t_theta], [st_dnn])
img_out_gpu, = st_dnn_func(img, transform)
img_out = np.asarray(img_out_gpu)
# Check if function graph contains the spatial transformer Ops # Check if function graph contains the spatial transformer Ops
topo = st_dnn_func.maker.fgraph.toposort() topo = st_dnn_func.maker.fgraph.toposort()
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnGridGenerator)]) == 1 assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnTransformer)]) == 1
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnGridSampler)]) == 1
# Setup CPU Op # Setup CPU Op
t_img = T.tensor4('img')
t_theta = T.tensor3('theta')
st_cpu = spatialtf_cpu(t_theta, t_img, scale_height, scale_width, 'nearest') st_cpu = spatialtf_cpu(t_theta, t_img, scale_height, scale_width, 'nearest')
st_cpu_func = theano.function([t_theta, t_img], [st_cpu], mode=mode_without_gpu) st_cpu_func = theano.function([t_theta, t_img], [st_cpu], mode=mode_without_gpu)
res, = st_cpu_func(transform, img) res, = st_cpu_func(transform, img)
img_out_gpu = st_dnn_func()
img_out = np.asarray(img_out_gpu[0])
utt.assert_allclose(img_out, res, rtol=1e-2, atol=1e-2) utt.assert_allclose(img_out, res, rtol=1e-2, atol=1e-2)
Markdown is supported
0%
You are about to add 0 people to this discussion. Please proceed with caution.
Finish editing this comment first!
Register or sign in to comment