Merge spatial transformer implementation into a single Op, GpuDnnTransformer

上级 09e362f2
#section support_code
/*
 * Generate the sampling grid of a cuDNN 2D spatial transformer.
 *
 * grid_dimensions : 1-D numpy array of 4 integers (num_images, num_channels,
 *                   height, width) describing the output tensor.
 * theta           : GPU array of affine transformations, shape (num_images, 2, 3),
 *                   dtype float16/float32/float64, C-contiguous.
 * desc            : previously configured cuDNN spatial transformer descriptor.
 * grid            : in/out pointer to the coordinate grid; reallocated only when
 *                   missing or of the wrong shape/dtype. On success holds a
 *                   C-contiguous array of shape (num_images, height, width, 2).
 * _handle         : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int
spatialtf_grid(PyArrayObject * grid_dimensions,
               PyGpuArrayObject * theta,
               cudnnSpatialTransformerDescriptor_t desc,
               PyGpuArrayObject ** grid,
               cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = theta->context;
    cudnnStatus_t err;

    if ( theta->ga.typecode != GA_FLOAT &&
         theta->ga.typecode != GA_DOUBLE &&
         theta->ga.typecode != GA_HALF )
    {
        PyErr_SetString( PyExc_TypeError, "Unsupported data type for theta" );
        return -1;
    }

    if ( PyGpuArray_NDIM( theta ) != 3 )
    {
        PyErr_SetString( PyExc_RuntimeError,
                         "theta must have three dimensions!" );
        return -1;
    }

    // BUG FIX: the original test used &&, which only rejected theta when BOTH
    // trailing dimensions were wrong, letting e.g. (n, 2, 5) slip through.
    // theta must be exactly (num_images, 2, 3), so reject when EITHER is wrong.
    if ( PyGpuArray_DIM( theta, 1 ) != 2 || PyGpuArray_DIM( theta, 2 ) != 3 )
    {
        // Dimensions are size_t; cast to unsigned long for a portable format
        // specifier (the original passed size_t through %d: undefined behavior
        // on LP64 platforms).
        PyErr_Format( PyExc_RuntimeError,
            "Incorrect dimensions for theta, should be (%lu, %lu, %lu), got (%lu, %lu, %lu)",
            (unsigned long) PyGpuArray_DIMS( theta )[0], 2lu, 3lu,
            (unsigned long) PyGpuArray_DIMS( theta )[0],
            (unsigned long) PyGpuArray_DIMS( theta )[1],
            (unsigned long) PyGpuArray_DIMS( theta )[2] );
        return -1;
    }

    // grid_dimensions is a 1-D vector that must hold exactly 4 entries
    // (the original message wrongly said "4 dimensions").
    if ( PyArray_DIM( grid_dimensions, 0 ) != 4 )
    {
        PyErr_SetString( PyExc_RuntimeError,
                         "grid_dimensions must have 4 elements!" );
        return -1;
    }

    // Obtain grid dimensions. Entry 1 (number of image channels) is not
    // needed for the coordinate grid and is deliberately skipped.
    const size_t num_images = (size_t) *( (npy_int *) PyArray_GETPTR1( grid_dimensions, 0 ) );
    const size_t height = (size_t) *( (npy_int *) PyArray_GETPTR1( grid_dimensions, 2 ) );
    const size_t width = (size_t) *( (npy_int *) PyArray_GETPTR1( grid_dimensions, 3 ) );
    // Grid of coordinates is of size num_images * height * width * 2 for a 2D transformation
    const size_t grid_dims[4] = { num_images, height, width, 2 };

    if ( width == 0 || height == 0 || num_images == 0 )
    {
        PyErr_SetString( PyExc_RuntimeError,
                         "One of the grid dimensions is zero" );
        return -1;
    }

    // Reuse the previously allocated grid when it already has the right
    // shape and dtype; otherwise (re)allocate.
    // BUG FIX: the original passed (*grid)->ga.typecode as the expected
    // typecode, comparing the cached grid's dtype against itself (always
    // true), so a cached grid whose dtype differed from theta's was reused.
    // The grid must match theta's dtype.
    if ( NULL == *grid ||
         ! theano_size_check( *grid, 4, grid_dims, theta->ga.typecode ) )
    {
        Py_XDECREF( *grid );
        *grid = pygpu_empty( 4, grid_dims, theta->ga.typecode, GA_C_ORDER,
                             gpu_ctx, Py_None );
        if ( NULL == *grid )
        {
            PyErr_SetString( PyExc_MemoryError,
                             "Could not allocate memory for grid of coordinates" );
            return -1;
        }
    }

    // cuDNN requires densely packed (C-contiguous) buffers.
    if ( ! GpuArray_IS_C_CONTIGUOUS( &(theta->ga) ) )
    {
        PyErr_SetString( PyExc_MemoryError,
                         "theta data is not C-contiguous" );
        return -1;
    }

    if ( ! GpuArray_IS_C_CONTIGUOUS( &((*grid)->ga) ) )
    {
        PyErr_SetString( PyExc_MemoryError,
                         "grid data is not C-contiguous" );
        return -1;
    }

    // Synchronize with any pending GPU work on the buffers before the
    // cuDNN call, and record our usage afterwards.
    cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    const void * theta_data = PyGpuArray_DEV_DATA( theta );
    void * grid_data = PyGpuArray_DEV_DATA( *grid );

    err = cudnnSpatialTfGridGeneratorForward( _handle, desc, theta_data, grid_data );

    cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
                      "Failed to create grid of coordinates: %s",
                      cudnnGetErrorString( err ) );
        return -1;
    }

    return 0;
}
...@@ -2833,7 +2833,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -2833,7 +2833,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
return [rval] return [rval]
class GpuDnnSpatialTfDesc(COp): class _GpuDnnTransformerDescriptor(COp):
""" """
This Op builds a spatial transformer descriptor for use in spatial transformer network This Op builds a spatial transformer descriptor for use in spatial transformer network
...@@ -2859,13 +2859,13 @@ class GpuDnnSpatialTfDesc(COp): ...@@ -2859,13 +2859,13 @@ class GpuDnnSpatialTfDesc(COp):
return False return False
def __init__(self, dtype=theano.config.floatX): def __init__(self, dtype=theano.config.floatX):
COp.__init__(self, ["c_code/spatialtf_desc.c"], "APPLY_SPECIFIC(spatialtf_desc)") COp.__init__(self, ["c_code/dnn_sptf_desc.c"], "APPLY_SPECIFIC(dnn_sptf_desc)")
assert cudnn.cudnnDataType_t.has_alias(dtype) assert cudnn.cudnnDataType_t.has_alias(dtype)
self.dtype = dtype self.dtype = dtype
def make_node(self, dimensions): def make_node(self, dimensions):
# cuDNN supports only 2D transformations, therefore output tensor must # cuDNN supports only 2D transformations, and the output tensor must
# have exactly 4 dimensions: (num_images, num_channels, height, width) # have exactly 4 dimensions: (num_images, num_channels, height, width)
assert len(dimensions) == 4 assert len(dimensions) == 4
dimensions = tuple(dimensions) dimensions = tuple(dimensions)
...@@ -2883,63 +2883,31 @@ class GpuDnnSpatialTfDesc(COp): ...@@ -2883,63 +2883,31 @@ class GpuDnnSpatialTfDesc(COp):
return node return node
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(GpuDnnSpatialTfDesc, self).c_code_cache_version(), version()) return (super(_GpuDnnTransformerDescriptor, self).c_code_cache_version(), version())
class GpuDnnGridGenerator(DnnBase): class GpuDnnTransformer(DnnBase):
""" """
This Op builds a spatial transformer grid generator for use in spatial transformer network This Op builds a spatial transformer that can be used in spatial transformer networks.
operations.
""" """
__props__ = ('dtype',) __props__ = ('dtype',)
_cop_num_inputs = 3 _cop_num_inputs = 6
_cop_num_outputs = 1 _cop_num_outputs = 1
_f16_ok = True
def __init__(self, dtype): def __init__(self, dtype):
DnnBase.__init__(self, ["c_code/spatialtf_grid.c"], "spatialtf_grid") DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "dnn_sptf")
self.dtype = dtype self.dtype = dtype
def make_node(self, grid_dimensions, theta, desc): def make_node(self, img, theta, grid_dims, desc, alpha=None, beta=None):
context_name = infer_context_name(desc, theta)
grid_dimensions = as_tensor_variable(grid_dimensions)
theta = gpu_contiguous(as_gpuarray_variable(theta, context_name))
assert theta.dtype in ('float16', 'float32', 'float64') assert theta.dtype in ('float16', 'float32', 'float64')
# Allocate GPU memory for grid of coordinates context_name = infer_context_name(img)
grid = GpuArrayType(dtype=self.dtype,
broadcastable=(False, False, False, False,),
context_name=context_name)()
return Apply(self, [grid_dimensions, theta, desc], [grid])
def L_op(self, inputs, outputs, output_grads):
pass
class GpuDnnGridSampler(DnnBase):
"""
This Op builds a spatial transformer grid sampler for use in spatial transformer network
operations.
"""
__props__ = ('dtype',)
_cop_num_inputs = 5
_cop_num_outputs = 1
def __init__(self, dtype):
DnnBase.__init__(self, ["c_code/spatialtf_sampler.c"], "spatialtf_sampler")
self.dtype = dtype
def make_node(self, img, grid, desc, alpha=None, beta=None):
context_name = infer_context_name(img, grid)
theta = gpu_contiguous(as_gpuarray_variable(theta, context_name))
img = as_gpuarray_variable(img, context_name) img = as_gpuarray_variable(img, context_name)
grid = as_gpuarray_variable(grid, context_name) grid_dims = as_tensor_variable(grid_dims)
output = GpuArrayType(dtype=self.dtype, output = GpuArrayType(dtype=self.dtype,
broadcastable=img.type.ndim * (False,), broadcastable=img.type.ndim * (False,),
...@@ -2955,9 +2923,9 @@ class GpuDnnGridSampler(DnnBase): ...@@ -2955,9 +2923,9 @@ class GpuDnnGridSampler(DnnBase):
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
beta = ensure_dt(beta, _zero, 'beta', img.dtype) beta = ensure_dt(beta, _zero, 'beta', img.dtype)
return Apply(self, [img, grid, desc, alpha, beta], [output]) return Apply(self, [img, theta, grid_dims, desc, alpha, beta], [output])
def L_op(self, inputs, outputs, output_grads): def L_op(self, inputs, outputs, grads):
pass pass
...@@ -3011,13 +2979,12 @@ def dnn_spatialtf(inp, theta, scale_width=1, scale_height=1, alpha=None, beta=No ...@@ -3011,13 +2979,12 @@ def dnn_spatialtf(inp, theta, scale_width=1, scale_height=1, alpha=None, beta=No
theta = gpu_contiguous(theta) theta = gpu_contiguous(theta)
# Create spatial transformer descriptor # Create spatial transformer descriptor
desc = GpuDnnSpatialTfDesc(dtype)(grid_dims) desc = _GpuDnnTransformerDescriptor(dtype)(grid_dims)
# Create grid dimensions variable # Create grid dimensions variable
grid_dims_var = as_tensor_variable(grid_dims) grid_dims_var = as_tensor_variable(grid_dims)
# Setup and return sampling grid # Setup spatial transformer
grid_coord = GpuDnnGridGenerator(dtype)(grid_dims_var, theta, desc) transformer = GpuDnnTransformer(dtype)(inp, theta, grid_dims_var, desc, alpha, beta)
grid_sampler = GpuDnnGridSampler(dtype)(inp, grid_coord, desc, alpha, beta) return transformer
return grid_sampler
@local_optimizer([AbstractConv2d, AbstractConv3d]) @local_optimizer([AbstractConv2d, AbstractConv3d])
......
...@@ -2440,7 +2440,6 @@ def test_dnn_spatialtf(): ...@@ -2440,7 +2440,6 @@ def test_dnn_spatialtf():
img = np.random.randint(low=0, high=256, size=img_dims) img = np.random.randint(low=0, high=256, size=img_dims)
# Convert from NHWC to NCHW # Convert from NHWC to NCHW
img = np.transpose(img, axes=(0, 3, 1, 2)).astype(theano.config.floatX) img = np.transpose(img, axes=(0, 3, 1, 2)).astype(theano.config.floatX)
gpu_img = gpuarray_shared_constructor(img)
# Downsample image dimensions by a factor of 2, i.e. our output tensor will # Downsample image dimensions by a factor of 2, i.e. our output tensor will
# have shape (n, c, h / 2, w / 2) # have shape (n, c, h / 2, w / 2)
scale_height = 0.25 scale_height = 0.25
...@@ -2451,25 +2450,25 @@ def test_dnn_spatialtf(): ...@@ -2451,25 +2450,25 @@ def test_dnn_spatialtf():
[0, -1, 0]] [0, -1, 0]]
transform = np.asarray(img_dims[0] * [theta], dtype=theano.config.floatX) transform = np.asarray(img_dims[0] * [theta], dtype=theano.config.floatX)
gpu_transform = gpuarray_shared_constructor(transform)
st_dnn = dnn.dnn_spatialtf(gpu_img, gpu_transform, scale_height=scale_height, # Create symbolic variables for inputs and transformations
t_img = T.tensor4('img')
t_theta = T.tensor3('theta')
st_dnn = dnn.dnn_spatialtf(t_img, t_theta, scale_height=scale_height,
scale_width=scale_width) scale_width=scale_width)
st_dnn_func = theano.function([], [st_dnn]) st_dnn_func = theano.function([t_img, t_theta], [st_dnn])
img_out_gpu, = st_dnn_func(img, transform)
img_out = np.asarray(img_out_gpu)
# Check if function graph contains the spatial transformer Ops # Check if function graph contains the spatial transformer Ops
topo = st_dnn_func.maker.fgraph.toposort() topo = st_dnn_func.maker.fgraph.toposort()
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnGridGenerator)]) == 1 assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnTransformer)]) == 1
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnGridSampler)]) == 1
# Setup CPU Op # Setup CPU Op
t_img = T.tensor4('img')
t_theta = T.tensor3('theta')
st_cpu = spatialtf_cpu(t_theta, t_img, scale_height, scale_width, 'nearest') st_cpu = spatialtf_cpu(t_theta, t_img, scale_height, scale_width, 'nearest')
st_cpu_func = theano.function([t_theta, t_img], [st_cpu], mode=mode_without_gpu) st_cpu_func = theano.function([t_theta, t_img], [st_cpu], mode=mode_without_gpu)
res, = st_cpu_func(transform, img) res, = st_cpu_func(transform, img)
img_out_gpu = st_dnn_func()
img_out = np.asarray(img_out_gpu[0])
utt.assert_allclose(img_out, res, rtol=1e-2, atol=1e-2) utt.assert_allclose(img_out, res, rtol=1e-2, atol=1e-2)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论