Add docstring and refactor spatial transformer to use a downsampling factor instead of explicit grid dimensions

Parent 14766a3d
#section support_code_apply
int APPLY_SPECIFIC(spatialtf_desc)(cudnnSpatialTransformerDescriptor_t * desc,
int APPLY_SPECIFIC(spatialtf_desc)(npy_int32 dim_nimages,
npy_int32 dim_nchannels,
npy_int32 dim_height,
npy_int32 dim_width,
cudnnSpatialTransformerDescriptor_t * desc,
PARAMS_TYPE * params)
{
cudnnStatus_t err;
if ( params->nimages == 0 || params->nchannels == 0 ||
params->height == 0 || params->width == 0 )
const int nimages = (int) dim_nimages;
const int nchannels = (int) dim_nchannels;
const int height = (int) dim_height;
const int width = (int) dim_width;
if ( nimages == 0 || nchannels == 0 || height == 0 || width == 0 )
{
PyErr_SetString( PyExc_RuntimeError, "Invalid grid dimensions" );
return -1;
}
// num_images, num_channels, height, width
const int out_tensor_dims[4] = { params->nimages, params->nchannels, params->height, params->width };
const int out_tensor_dims[4] = { nimages, nchannels, height, width };
err = cudnnCreateSpatialTransformerDescriptor( desc );
if ( CUDNN_STATUS_SUCCESS != err )
......@@ -30,7 +38,7 @@ int APPLY_SPECIFIC(spatialtf_desc)(cudnnSpatialTransformerDescriptor_t * desc,
params->dtype, 4, out_tensor_dims );
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_MemoryError,
PyErr_Format( PyExc_MemoryError,
"Failed to initialize spatial transformer descriptor: %s",
cudnnGetErrorString( err ) );
return -1;
......
......@@ -2840,9 +2840,8 @@ class GpuDnnSpatialTfDesc(COp):
operations.
"""
__props__ = ('dimensions', 'dtype')
params_type = ParamsType(nimages=int_t, nchannels=int_t, height=int_t, width=int_t,
dtype=cudnn.cudnnDataType_t)
__props__ = ('dtype',)
params_type = ParamsType(dtype=cudnn.cudnnDataType_t)
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -2859,19 +2858,20 @@ class GpuDnnSpatialTfDesc(COp):
def do_constant_folding(self, node):
return False
def __init__(self, dimensions, dtype="float32"):
def __init__(self, dtype=theano.config.floatX):
COp.__init__(self, ["c_code/spatialtf_desc.c"], "APPLY_SPECIFIC(spatialtf_desc)")
# cuDNN supports only 2D transformations, therefore output tensor must
# have exactly 4 dimensions: (width, height, num_channels, num_images)
assert len(dimensions) == 4
self.dimensions = tuple(dimensions)
assert cudnn.cudnnDataType_t.has_alias(dtype)
self.dtype = dtype
def make_node(self):
node = Apply(self, [],
def make_node(self, dimensions):
# cuDNN supports only 2D transformations, therefore output tensor must
# have exactly 4 dimensions: (num_images, num_channels, height, width)
assert len(dimensions) == 4
dimensions = tuple(dimensions)
nimages, nchannels, height, width = dimensions
node = Apply(self, [nimages, nchannels, height, width],
[CDataType("cudnnSpatialTransformerDescriptor_t",
freefunc="cudnnDestroySpatialTransformerDescriptor")()])
# DebugMode cannot compare the values of CDataType variables, so by
......@@ -2882,15 +2882,6 @@ class GpuDnnSpatialTfDesc(COp):
out.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
return node
# Number of images
nimages = property(lambda self: self.dimensions[0])
# Number of channels
nchannels = property(lambda self: self.dimensions[1])
# Grid height
height = property(lambda self: self.dimensions[2])
# Grid width
width = property(lambda self: self.dimensions[3])
def c_code_cache_version(self):
return (super(GpuDnnSpatialTfDesc, self).c_code_cache_version(), version())
......@@ -2917,7 +2908,6 @@ class GpuDnnGridGenerator(DnnBase):
theta = gpu_contiguous(as_gpuarray_variable(theta, context_name))
assert theta.dtype in ('float16', 'float32', 'float64')
assert cudnn.cudnnDataType_t.has_alias(theta.dtype)
# Allocate GPU memory for grid of coordinates
grid = GpuArrayType(dtype=self.dtype,
......@@ -2971,35 +2961,57 @@ class GpuDnnGridSampler(DnnBase):
pass
def dnn_spatialtf(inp, theta, downsampling_factor=1, alpha=None, beta=None, dtype=theano.config.floatX):
    """
    GPU spatial transformer using cuDNN from NVIDIA.

    Parameters
    ----------
    inp : tensor
        Input feature maps in format NCHW
        (number of inputs, number of channels, height, width).
    theta : matrix
        Affine transformation matrices generated by the localization
        network, one 2x3 matrix per input image.
    downsampling_factor : float
        Downsampling factor applied to both spatial dimensions of the
        output. A value of 1 keeps the original size of the input;
        values larger than 1 downsample the input and values below 1
        upsample it.
    alpha, beta : optional
        Forwarded unchanged to the grid sampler Op
        (presumably cuDNN's usual alpha/beta output scaling — confirm
        against GpuDnnGridSampler).
    dtype : str
        Data type used for the transformation
        (defaults to ``theano.config.floatX``).

    Returns
    -------
    out : tensor
        Transformed inputs with shape
        ``(number of inputs, number of channels,
        floor(height / downsampling_factor),
        floor(width / downsampling_factor))``.

    Notes
    -----
    cuDNN currently only supports 2D transformations with a 2x3 affine
    transformation matrix. Also, the only sampler available is the
    bilinear interpolation.
    """
    # inp is a 4D tensor with shape: (num_inputs, num_channels, height, width)
    assert inp.ndim == 4
    # theta is an array of transformation matrices and must have
    # shape: (num_inputs, 2, 3)
    assert theta.ndim == 3

    # Normalize the factor before it is used: the grid dimensions below must
    # be computed with the exact (possibly fractional) factor the caller
    # requested, so the float() conversion has to happen first.
    downsampling_factor = float(downsampling_factor)

    # Output grid keeps the batch size and channel count of the input; the
    # spatial dimensions are scaled by the downsampling factor and floored.
    grid_dims = (as_scalar(inp.shape[0]).astype('int32'),
                 as_scalar(inp.shape[1]).astype('int32'),
                 as_scalar(inp.shape[2] // downsampling_factor).astype('int32'),
                 as_scalar(inp.shape[3] // downsampling_factor).astype('int32'))

    inp = gpu_contiguous(inp)
    theta = gpu_contiguous(theta)

    # Create spatial transformer descriptor
    desc = GpuDnnSpatialTfDesc(dtype)(grid_dims)
    # Create grid dimensions variable
    grid_dims_var = as_tensor_variable(grid_dims)
    # Set up the grid of sampling coordinates, then sample the input
    # through it to produce the transformed output.
    grid_coord = GpuDnnGridGenerator(dtype)(grid_dims_var, theta, desc)
    grid_sampler = GpuDnnGridSampler(dtype)(inp, grid_coord, desc, alpha, beta)
    return grid_sampler
......
......@@ -2441,11 +2441,9 @@ def test_dnn_spatialtf():
# Convert from NHWC to NCHW
img = np.transpose(img, axes=(0, 3, 1, 2)).astype(theano.config.floatX)
gpu_img = gpuarray_shared_constructor(img)
# Downsample image dimensions by a factor of 2, i.e. our output tensor will
# have shape (n, c, h / 2, w / 2)
downsample_factor = 2
grid_h = img_dims[1] // downsample_factor
grid_w = img_dims[2] // downsample_factor
grid_dims = (img_dims[0], img_dims[3], grid_h, grid_w)
# Transformation matrix
rotation = [[1, 0, 0],
......@@ -2454,7 +2452,7 @@ def test_dnn_spatialtf():
transform = np.asarray(img_dims[0] * [rotation], dtype=theano.config.floatX)
gpu_transform = gpuarray_shared_constructor(transform)
st_dnn = dnn.dnn_spatialtf(gpu_img, gpu_transform, grid_dims)
st_dnn = dnn.dnn_spatialtf(gpu_img, gpu_transform, downsample_factor)
st_dnn_func = theano.function([], [st_dnn])
# Check if function graph contains the spatial transformer Ops
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment