Split spatial transformer implementation into grid and sampler Ops

Signed-off-by: João Victor Risso <joaovictor.risso@gmail.com>

Split spatial transformer implementation into grid and sampler Ops
c25df18f · João Victor Risso · 659b7c8f · c25df18f · c25df18f · c25df18f
--- a/theano/gpuarray/c_code/dnn_sptf_grid.c
+++ b/theano/gpuarray/c_code/dnn_sptf_grid.c
+#section support_code_struct
+int
+APPLY_SPECIFIC(dnn_sptf_grid)(PyGpuArrayObject * theta,
+                              PyArrayObject * grid_dims,
+                              cudnnSpatialTransformerDescriptor_t desc,
+                              PyGpuArrayObject ** grid,
+                              cudnnHandle_t _handle)
+{
+    PyGpuContextObject * gpu_ctx = theta->context;
+    size_t gpu_grid_dims[4];
+    int num_images, num_channels, height, width;
+    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
+    if ( theta->ga.typecode != GA_FLOAT &&
+         theta->ga.typecode != GA_DOUBLE &&
+         theta->ga.typecode != GA_HALF )
+    {
+        PyErr_SetString( PyExc_TypeError,
+            "GpuDnnTransformerGrid: unsupported data type for theta in spatial transformer." );
+        return 1;
+    }
+    else if ( PyGpuArray_DIM( theta, 1 ) != 2 && PyGpuArray_DIM( theta, 2 ) != 3 )
+    {
+        PyErr_Format( PyExc_RuntimeError,
+            "GpuDnnTransformerGrid: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
+            PyGpuArray_DIMS( theta )[0], 2, 3, PyGpuArray_DIMS( theta )[0],
+            PyGpuArray_DIMS( theta )[1], PyGpuArray_DIMS( theta )[2] );
+        return 1;
+    }
+    if ( PyArray_NDIM( grid_dims ) != 1 || PyArray_SIZE( grid_dims ) != 4 )
+    {
+        PyErr_SetString( PyExc_MemoryError,
+            "GpuDnnTransformerGrid: grid_dims must have 4 elements." );
+        return 1;
+    }
+    // Obtain grid dimensions
+    num_images = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 0 ) );
+    height = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 2 ) );
+    width = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 3 ) );
+    gpu_grid_dims[0] = num_images;
+    gpu_grid_dims[1] = height;
+    gpu_grid_dims[2] = width;
+    gpu_grid_dims[3] = 2;
+    if ( theano_prep_output( grid, 4, gpu_grid_dims, theta->ga.typecode,
+                             GA_C_ORDER, gpu_ctx ) != 0 )
+    {
+        PyErr_SetString( PyExc_RuntimeError,
+            "GpuDnnTransformerGrid: could not allocate memory for grid of coordinates" );
+        return 1;
+    }
+    cuda_enter( gpu_ctx->ctx );
+    cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
+    cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
+    err = cudnnSpatialTfGridGeneratorForward( _handle, desc, PyGpuArray_DEV_DATA( theta ),
+        PyGpuArray_DEV_DATA( *grid ) );
+    cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
+    cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
+    cuda_exit( gpu_ctx->ctx );
+    if ( CUDNN_STATUS_SUCCESS != err )
+    {
+        PyErr_Format( PyExc_RuntimeError,
+            "GpuDnnTransformerGrid: could not create grid of coordinates: %s",
+            cudnnGetErrorString( err ) );
+        return 1;
+    }
+    return 0;
+}
\ No newline at end of file
--- a/theano/gpuarray/c_code/dnn_sptf.c
+++ b/theano/gpuarray/c_code/dnn_sptf.c
@@ -14,7 +14,7 @@ APPLY_SPECIFIC(ydesc) = NULL;
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
-            "GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor xdesc: %s",
+            "GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor xdesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }
@@ -23,7 +23,7 @@ APPLY_SPECIFIC(ydesc) = NULL;
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
-            "GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor ydesc: %s",
+            "GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor ydesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }
@@ -40,22 +40,19 @@ if ( APPLY_SPECIFIC(ydesc) != NULL )
 #section support_code_struct
 int
-APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
+APPLY_SPECIFIC(dnn_sptf_sampler)(PyGpuArrayObject * input,
-                         PyGpuArrayObject * theta,
+                                 PyGpuArrayObject * grid,
-                         PyArrayObject * grid_dims,
+                                 cudnnSpatialTransformerDescriptor_t desc,
-                         cudnnSpatialTransformerDescriptor_t desc,
+                                 PyGpuArrayObject ** output,
-                         PyGpuArrayObject ** output,
+                                 cudnnHandle_t _handle)
-                         PyGpuArrayObject ** grid,
-                         cudnnHandle_t _handle)
 {
    PyGpuContextObject * gpu_ctx = input->context;
    void * alpha_p;
    void * beta_p;
    double alpha = 1.0, beta = 0.0;
    float af = alpha, bf = beta;
+    size_t out_dims[4];
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-    int num_images, num_channels, height, width;
-    size_t gpu_grid_dims[4], out_dims[4];
    switch (input->ga.typecode)
    {
@@ -77,81 +74,23 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
        return 1;
    }
-    if ( theta->ga.typecode != GA_FLOAT &&
+    out_dims[0] = (size_t) PyGpuArray_DIM(input, 0); // num_images
-         theta->ga.typecode != GA_DOUBLE &&
+    out_dims[1] = (size_t) PyGpuArray_DIM(input, 1); // num_channels
-         theta->ga.typecode != GA_HALF )
+    out_dims[2] = (size_t) PyGpuArray_DIM(grid, 1); // grid width
-    {
+    out_dims[3] = (size_t) PyGpuArray_DIM(grid, 2); // grid height
-        PyErr_SetString( PyExc_TypeError,
-            "GpuDnnTransformer: unsupported data type for theta in spatial transformer." );
-        return 1;
-    }
-    else if ( PyGpuArray_DIM( theta, 1 ) != 2 && PyGpuArray_DIM( theta, 2 ) != 3 )
-    {
-        PyErr_Format( PyExc_RuntimeError,
-            "GpuDnnTransformer: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
-            PyGpuArray_DIMS( theta )[0], 2, 3, PyGpuArray_DIMS( theta )[0],
-            PyGpuArray_DIMS( theta )[1], PyGpuArray_DIMS( theta )[2] );
-        return 1;
-    }
-    if ( PyArray_NDIM( grid_dims ) != 1 || PyArray_SIZE( grid_dims ) != 4 )
+    if ( out_dims[0] == 0 || out_dims[1] == 0 || out_dims[2] == 0 || out_dims[3] == 0 )
-    {
-        PyErr_SetString( PyExc_MemoryError,
-            "GpuDnnTransformer: grid_dims must have 4 elements." );
-        return 1;
-    }
-    // Obtain grid dimensions
-    num_images = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 0 ) );
-    num_channels = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 1 ) );
-    height = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 2 ) );
-    width = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 3 ) );
-    gpu_grid_dims[0] = num_images;
-    gpu_grid_dims[1] = height;
-    gpu_grid_dims[2] = width;
-    gpu_grid_dims[3] = 2;
-    out_dims[0] = num_images;
-    out_dims[1] = num_channels;
-    out_dims[2] = height;
-    out_dims[3] = width;
-    if ( width == 0 || height == 0 || num_images == 0 )
    {
        PyErr_SetString( PyExc_RuntimeError,
-            "GpuDnnTransformer: grid_dims has a dimension with value zero" );
+            "GpuDnnTransformerSampler: one of the sampler dimensions is zero" );
-        return 1;
+        return 1; 
-    }
-    if ( PyGpuArray_DIM( input, 0 ) != num_images )
-    {
-        PyErr_Format( PyExc_RuntimeError,
-            "GpuDnnTransformer: expected batch size %d, got %d.",
-            num_images, PyGpuArray_DIM( input, 0 ) );
-        return 1;
-    }
-    else if ( PyGpuArray_DIM( input, 1 ) != num_channels )
-    {
-        PyErr_Format( PyExc_RuntimeError,
-            "GpuDnnTransformer: expected input to have %d channels, got %d channels.",
-            num_channels, PyGpuArray_DIM( input, 1 ) );
-        return 1;
-    }
-    if ( theano_prep_output( grid, 4, gpu_grid_dims, input->ga.typecode,
-                             GA_C_ORDER, gpu_ctx ) != 0 )
-    {
-        PyErr_SetString( PyExc_RuntimeError,
-            "GpuDnnTransformer: could not allocate memory for grid of coordinates" );
-        return 1;
    }
    if ( theano_prep_output( output, 4, out_dims, input->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
    {
        PyErr_SetString( PyExc_MemoryError,
-            "GpuDnnTransformer: could not allocate memory for grid sampler" );
+            "GpuDnnTransformerSampler: could not allocate memory for grid sampler" );
        return 1;
    }
@@ -164,29 +103,15 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
    cuda_enter( gpu_ctx->ctx );
    cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
-    cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
+    cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
-    cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
-    err = cudnnSpatialTfGridGeneratorForward( _handle, desc, PyGpuArray_DEV_DATA( theta ),
-        PyGpuArray_DEV_DATA( *grid ) );
-    if ( CUDNN_STATUS_SUCCESS != err )
-    {
-        PyErr_Format( PyExc_RuntimeError,
-            "GpuDnnTransformer: could not create grid of coordinates: %s",
-            cudnnGetErrorString( err ) );
-        cuda_exit( gpu_ctx->ctx );
-        return 1;
-    }
    err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, APPLY_SPECIFIC(xdesc),
-        PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( *grid ), beta_p,
+        PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( grid ), beta_p,
        APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) );
    cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
-    cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
+    cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
-    cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_exit( gpu_ctx->ctx );
@@ -194,10 +119,10 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
-            "GpuDnnTransformer: could not create grid sampler: %s",
+            "GpuDnnTransformerSampler: could not create grid sampler: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }
    return 0;
 }
\ No newline at end of file
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -2884,34 +2884,22 @@ class GpuDnnTransformerDesc(COp):
        return (super(GpuDnnTransformerDesc, self).c_code_cache_version(), version())
-class GpuDnnTransformer(DnnBase):
+class GpuDnnTransformerGrid(DnnBase):
-    """
-    Spatial transformer that can be used in spatial transformer networks, it
-    implements the grid generator and sampler. The localization network can
-    be built using neural net components of Theano.
-    """
    __props__ = ()
-    _cop_num_inputs = 4
+    _cop_num_inputs = 3
-    _cop_num_outputs = 2
+    _cop_num_outputs = 1
    _f16_ok = True
-    default_output = 0
    def __init__(self):
-        DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "APPLY_SPECIFIC(dnn_sptf)")
+        DnnBase.__init__(self, ["c_code/dnn_sptf_grid.c"], "APPLY_SPECIFIC(dnn_sptf_grid)")
-    def make_node(self, img, theta, desc):
+    def make_node(self, theta, desc):
        context_name = infer_context_name(desc)
        if (not isinstance(desc.type, CDataType) or
                desc.type.ctype != 'cudnnSpatialTransformerDescriptor_t'):
            raise ValueError('desc must be cudnnSpatialTransformerDescriptor_t')
-        img = gpu_contiguous(as_gpuarray_variable(img, context_name))
-        if img.type.ndim != 4:
-            raise TypeError('img must be a 4D tensor')
-        elif img.dtype not in ('float16', 'float32', 'float64'):
-            raise TypeError('img type must be floating-point')
        theta = gpu_contiguous(as_gpuarray_variable(theta, context_name))
        assert theta.dtype in ('float16', 'float32', 'float64')
        assert theta.ndim == 3
@@ -2922,33 +2910,72 @@ class GpuDnnTransformer(DnnBase):
        assert grid_dims.ndim == 1
        # Ensure 64-bit ints are passed to the C code
        grid_dims = theano.tensor.basic.cast(grid_dims, 'int64')
+        grid = GpuArrayType(dtype=theta.dtype,
+                            broadcastable=(theta.type.ndim + 1) * (False,),
+                            context_name=context_name)()
-        output = GpuArrayType(dtype=img.dtype,
+        inputs = [theta, grid_dims, desc]
-                              broadcastable=img.type.ndim * (False,),
+        outputs = [grid]
-                              context_name=context_name)()
+        return Apply(self, inputs, outputs)
-        grid = GpuArrayType(dtype=img.dtype,
+    def grad(self, inputs, grads):
-                            broadcastable=img.type.ndim * (False,),
+        theta, grid_dims, desc = inputs
-                            context_name=context_name)()
+        dgrid = grads[0]
+        dtheta = GpuDnnTransformerGradT()(dgrid, desc)
+        return [dtheta, grad_not_implemented(self, 1, grid_dims), DisconnectedType()()]
-        inputs = [img, theta, grid_dims, desc]
+    def connection_pattern(self, node):
-        outputs = [output, grid]
+        # not connected to desc
+        return [[1], [1], [0]]
+class GpuDnnTransformerSampler(DnnBase):
+    __props__ = ()
+    _cop_num_inputs = 3
+    _cop_num_outputs = 1
+    _f16_ok = True
+    def __init__(self):
+        DnnBase.__init__(self, ["c_code/dnn_sptf_sampler.c"], "APPLY_SPECIFIC(dnn_sptf_sampler)")
+    def make_node(self, img, grid, desc):
+        context_name = infer_context_name(desc)
+        if (not isinstance(desc.type, CDataType) or
+                desc.type.ctype != 'cudnnSpatialTransformerDescriptor_t'):
+            raise ValueError('desc must be cudnnSpatialTransformerDescriptor_t')
+        img = gpu_contiguous(as_gpuarray_variable(img, context_name))
+        if img.type.ndim != 4:
+            raise TypeError('img must be a 4D tensor')
+        elif img.dtype not in ('float16', 'float32', 'float64'):
+            raise TypeError('img type must be floating-point')
+        grid = gpu_contiguous(as_gpuarray_variable(grid, context_name))
+        if grid.type.ndim != 4:
+            raise TypeError('grid must be a 4D tensor')
+        elif grid.dtype not in ('float16', 'float32', 'float64'):
+            raise TypeError('grid type must be floating-point')
+        out = GpuArrayType(dtype=img.dtype,
+                           broadcastable=img.type.ndim * (False,),
+                           context_name=context_name)()
+        inputs = [img, grid, desc]
+        outputs = [out]
        return Apply(self, inputs, outputs)
-    def L_op(self, inputs, outputs, grads):
+    def grad(self, inputs, grads):
-        img, _, grid_dims, desc = inputs
+        img, grid, desc = inputs
-        _, grid = outputs
        dy = grads[0]
        dimg, dgrid = GpuDnnTransformerGradI()(img, grid, dy, desc)
-        dtheta = GpuDnnTransformerGradT()(dgrid, desc)
+        return [dimg, dgrid, DisconnectedType()()]
-        dgrid_dims = grad_not_implemented(self, grid_dims, 2)
-        return [dimg, dtheta, dgrid_dims, DisconnectedType()()]
    def connection_pattern(self, node):
        # not connected to desc
-        return [[1, 1], [1, 1], [1, 1], [0, 0]]
+        return [[1], [1], [0]]
 class GpuDnnTransformerGradI(DnnBase):
@@ -3096,8 +3123,9 @@ def dnn_spatialtf(img, theta, scale_width=1, scale_height=1, precision=theano.co
    assert theta.ndim == 3
    # Setup spatial transformer
-    transformer = GpuDnnTransformer()(img, theta, desc)
+    grid = GpuDnnTransformerGrid()(theta, desc)
-    return transformer
+    sampler = GpuDnnTransformerSampler()(img, grid, desc)
+    return sampler
 @local_optimizer([AbstractConv2d, AbstractConv3d])

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -2456,8 +2456,10 @@ def test_dnn_spatialtf():
    st_dnn = dnn.dnn_spatialtf(t_img, t_theta, scale_height=scale_height,
                               scale_width=scale_width)
    st_dnn_func = theano.function([t_img, t_theta], st_dnn)
-    # Check if function graph contains the spatial transformer Op
+    # Check if function graph contains the spatial transformer's grid and sampler Ops
-    assert any([isinstance(node.op, dnn.GpuDnnTransformer)
+    assert any([isinstance(node.op, dnn.GpuDnnTransformerGrid)
+                for node in st_dnn_func.maker.fgraph.toposort()])
+    assert any([isinstance(node.op, dnn.GpuDnnTransformerSampler)
                for node in st_dnn_func.maker.fgraph.toposort()])
    img_out_gpu = st_dnn_func(img, transform)
@@ -2508,21 +2510,3 @@ def test_dnn_spatialtf_grad():
    assert any([isinstance(node.op, dnn.GpuDnnTransformerGradT)
                for node in grad_fn.maker.fgraph.toposort()])
-    # Verify grad wrt input
-    def functor_wrt_i(input):
-        desc = dnn.GpuDnnTransformerDesc(theano.config.floatX)(out_shp)
-        transformed_input = dnn.GpuDnnTransformer()(input, theta, desc)
-        grad = T.grad(T.mean(transformed_input), input)
-        return grad
-    # Verify grad wrt theta
-    def functor_wrt_t(theta):
-        desc = dnn.GpuDnnTransformerDesc(theano.config.floatX)(out_shp)
-        transformed_input = dnn.GpuDnnTransformer()(img, theta, out, desc)
-        grad = T.grad(T.mean(transformed_input), theta)
-        return grad
-    utt.verify_grad(functor_wrt_i, [img])
-    utt.verify_grad(functor_wrt_t, [theta])
\ No newline at end of file