提交 6d71e317 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6061 from joaovictortr/spatialtf_networks

Spatial Transformer Network Ops using cuDNN
......@@ -146,6 +146,8 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
- Softmax:
- You can manually use the op :class:`GpuDnnSoftmax
<theano.gpuarray.dnn.GpuDnnSoftmax>` to use its extra feature.
- Spatial Transformer:
- :func:`theano.gpuarray.dnn.dnn_spatialtf`.
List of Implemented Operations
==============================
......
#section support_code_struct

/* Per-instance cuDNN descriptors for GpuDnnTransformerGradI (gradient wrt
   the inputs of the spatial transformer sampler). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dxdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dydesc);

#section init_code_struct

/* Initialize every handle to NULL first so the cleanup section can safely
   destroy only the descriptors that were actually created. */
APPLY_SPECIFIC(sptf) = NULL;
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(dxdesc) = NULL;
APPLY_SPECIFIC(dydesc) = NULL;

{
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    /* Spatial transformer descriptor (configured later, per apply). */
    err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf));
    if (err != CUDNN_STATUS_SUCCESS)
    {
        PyErr_Format(PyExc_MemoryError,
            "GpuDnnTransformerGradI: could not allocate spatial transformer descriptor (sptf): %s",
            cudnnGetErrorString(err));
        FAIL;
    }

    /* Tensor descriptor for the forward input x. */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor xdesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }

    /* Tensor descriptor for the gradient wrt the input (dx). */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(dxdesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor dxdesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }

    /* Tensor descriptor for the gradient wrt the output (dy). */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(dydesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor dydesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }
}

#section cleanup_code_struct

/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
    cudnnDestroySpatialTransformerDescriptor( APPLY_SPECIFIC(sptf) );
if ( APPLY_SPECIFIC(xdesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(dxdesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(dxdesc) );
if ( APPLY_SPECIFIC(dydesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(dydesc) );
#section support_code_struct

/*
 * GpuDnnTransformerGradI: backward pass of the cuDNN spatial transformer
 * sampler. Computes the gradient wrt the sampled images (*input_grad) and
 * wrt the sampling grid (*grid_grad) given the gradient of the sampler
 * output (dy).
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_gi)(PyGpuArrayObject * input,
                            PyGpuArrayObject * grid,
                            PyGpuArrayObject * dy,
                            PyGpuArrayObject ** input_grad,
                            PyGpuArrayObject ** grid_grad,
                            cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = input->context;
    void * alpha_p;
    void * beta_p;
    double alpha = 1.0, beta = 0.0;
    float af = alpha, bf = beta;
    int out_dims[4];
    cudnnDataType_t dt;
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    // Select the cuDNN data type and matching scaling factors: cuDNN expects
    // double alpha/beta for double data, and float alpha/beta for float and
    // half data.
    switch (input->ga.typecode)
    {
    case GA_DOUBLE:
        alpha_p = (void *)&alpha;
        beta_p = (void *)&beta;
        dt = CUDNN_DATA_DOUBLE;
        break;
    case GA_FLOAT:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        dt = CUDNN_DATA_FLOAT;
        break;
    case GA_HALF:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        dt = CUDNN_DATA_HALF;
        break;
    default:
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformerGradI: unsupported type for input in spatial transformer gradients" );
        return 1;
    }

    if ( grid->ga.typecode != GA_FLOAT &&
         grid->ga.typecode != GA_DOUBLE &&
         grid->ga.typecode != GA_HALF )
    {
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformerGradI: unsupported data type for grid in spatial transformer gradients." );
        return 1;
    }

    // Allocate (or reuse) the output buffers, shaped like the corresponding
    // forward inputs.
    if ( theano_prep_output( input_grad, PyGpuArray_NDIM( input ),
                             PyGpuArray_DIMS( input ), input->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
        return 1;

    if ( theano_prep_output( grid_grad, PyGpuArray_NDIM( grid ),
                             PyGpuArray_DIMS( grid ), grid->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
        return 1;

    // Directly return the outputs if any of the dimensions is 0.
    // (cuDNN does not support zero-length dimensions.)
    // NOTE: this check must come *before* the descriptor setup below;
    // configuring a cuDNN descriptor with a zero-length dimension fails, so
    // placing the check afterwards would make it unreachable.
    if ( PyGpuArray_SIZE( *input_grad ) == 0 || PyGpuArray_SIZE( *grid_grad ) == 0 )
        return 0;

    // Obtain output dimensions to setup descriptor
    out_dims[0] = (int) PyGpuArray_DIM(input, 0); // num_images
    out_dims[1] = (int) PyGpuArray_DIM(input, 1); // num_channels
    out_dims[2] = (int) PyGpuArray_DIM(grid, 1); // grid height
    out_dims[3] = (int) PyGpuArray_DIM(grid, 2); // grid width

    // Currently, only the bilinear sampler is supported by cuDNN,
    // so the sampler method is currently not available as a parameter
    err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                                 dt, 4, out_dims );
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradI: could not initialize descriptor (sptf): %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
        return 1;

    if ( c_set_tensorNd( dy, APPLY_SPECIFIC(dydesc) ) != 0 )
        return 1;

    if ( c_set_tensorNd( *input_grad, APPLY_SPECIFIC(dxdesc) ) != 0 )
        return 1;

    // Make the cuDNN call wait on all pending reads/writes of its operands
    // and record our own accesses afterwards (libgpuarray stream ordering).
    cuda_enter( gpu_ctx->ctx );

    cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( dy->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*input_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_wait( (*grid_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfSamplerBackward( _handle, APPLY_SPECIFIC(sptf), alpha_p,
        APPLY_SPECIFIC(xdesc), PyGpuArray_DEV_DATA( input ), beta_p,
        APPLY_SPECIFIC(dxdesc), PyGpuArray_DEV_DATA( *input_grad ), alpha_p,
        APPLY_SPECIFIC(dydesc), PyGpuArray_DEV_DATA( dy ), PyGpuArray_DEV_DATA( grid ),
        beta_p, PyGpuArray_DEV_DATA( *grid_grad ) );

    cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( dy->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*input_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    cuda_record( (*grid_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    cuda_exit( gpu_ctx->ctx );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerGradI: failed to compute gradients of the inputs: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    return 0;
}
#section support_code_struct

/* Per-instance spatial transformer descriptor for GpuDnnTransformerGrid
   (grid generator). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);

#section init_code_struct

/* NULL-initialize so cleanup can tell whether creation succeeded. */
APPLY_SPECIFIC(sptf) = NULL;

{
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
    if ((err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf))) != CUDNN_STATUS_SUCCESS)
    {
        PyErr_Format(PyExc_MemoryError,
            "GpuDnnTransformerGrid: could not allocate spatial transformer descriptor (sptf): %s",
            cudnnGetErrorString(err));
        FAIL;
    }
}

#section cleanup_code_struct

/* Destroy the descriptor only if it was created. */
if (APPLY_SPECIFIC(sptf) != NULL) { cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf)); }
#section support_code_struct

/*
 * GpuDnnTransformerGrid: generates the sampling grid of (x, y) coordinates
 * from the per-image 2x3 affine transformations in theta, using
 * cudnnSpatialTfGridGeneratorForward.
 *
 * out_dims must be a 1-D host array of 4 int64 values (N, C, H, W) giving
 * the shape of the transformed output.
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_grid)(PyGpuArrayObject * theta,
                              PyArrayObject * out_dims,
                              PyGpuArrayObject ** grid,
                              cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = theta->context;
    size_t grid_dims[4];
    int num_images, num_channels, height, width;
    int desc_dims[4];
    cudnnDataType_t dt;
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    // Map the gpuarray typecode onto the cuDNN data type.
    switch(theta->ga.typecode)
    {
    case GA_DOUBLE:
        dt = CUDNN_DATA_DOUBLE;
        break;
    case GA_FLOAT:
        dt = CUDNN_DATA_FLOAT;
        break;
    case GA_HALF:
        dt = CUDNN_DATA_HALF;
        break;
    default:
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformerGrid: unsupported data type for theta in spatial transformer." );
        return 1;
    }

    // cuDNN only supports 2x3 affine transformation matrices.
    // Note: the dimensions are cast to int because PyErr_Format's %d
    // expects an int argument; passing size_t to %d is undefined behavior.
    if ( PyGpuArray_DIM( theta, 1 ) != 2 || PyGpuArray_DIM( theta, 2 ) != 3 )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerGrid: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
            (int) PyGpuArray_DIM( theta, 0 ), 2, 3, (int) PyGpuArray_DIM( theta, 0 ),
            (int) PyGpuArray_DIM( theta, 1 ), (int) PyGpuArray_DIM( theta, 2 ) );
        return 1;
    }

    // Shape validation failure: raise ValueError (not MemoryError, which is
    // reserved for allocation failures).
    if ( PyArray_NDIM( out_dims ) != 1 || PyArray_SIZE( out_dims ) != 4 )
    {
        PyErr_SetString( PyExc_ValueError,
            "GpuDnnTransformerGrid: out_dims must have 4 elements." );
        return 1;
    }

    // Obtain output dimensions (the Python Op casts out_dims to int64
    // before calling into this code)
    num_images = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 0 ) );
    num_channels = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 1 ) );
    height = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 2 ) );
    width = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 3 ) );

    // Set transformed output dimensions to setup the descriptor
    desc_dims[0] = num_images;
    desc_dims[1] = num_channels;
    desc_dims[2] = height;
    desc_dims[3] = width;

    // Set sampling grid dimensions: one (x, y) pair per output pixel.
    grid_dims[0] = num_images;
    grid_dims[1] = height;
    grid_dims[2] = width;
    grid_dims[3] = 2;

    // Currently, only the bilinear sampler is supported by cuDNN,
    // so the sampler method is currently not available as a parameter
    err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                                 dt, 4, desc_dims );
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGrid: could not initialize descriptor (sptf): %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    if ( theano_prep_output( grid, 4, grid_dims, theta->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
    {
        PyErr_SetString( PyExc_RuntimeError,
            "GpuDnnTransformerGrid: could not allocate memory for grid of coordinates" );
        return 1;
    }

    // Order the cuDNN call against pending accesses of theta and grid.
    cuda_enter( gpu_ctx->ctx );

    cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfGridGeneratorForward( _handle, APPLY_SPECIFIC(sptf),
        PyGpuArray_DEV_DATA( theta ), PyGpuArray_DEV_DATA( *grid ) );

    cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    cuda_exit( gpu_ctx->ctx );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerGrid: could not create grid of coordinates: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    return 0;
}
#section support_code_struct

/* Per-instance spatial transformer descriptor for GpuDnnTransformerGradT
   (gradient wrt the affine transformation theta). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);

#section init_code_struct

/* NULL-initialize so cleanup can tell whether creation succeeded. */
APPLY_SPECIFIC(sptf) = NULL;

{
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
    if ((err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf))) != CUDNN_STATUS_SUCCESS)
    {
        PyErr_Format(PyExc_MemoryError,
            "GpuDnnTransformerGradT: could not allocate spatial transformer descriptor (sptf): %s",
            cudnnGetErrorString(err));
        FAIL;
    }
}

#section cleanup_code_struct

/* Destroy the descriptor only if it was created. */
if (APPLY_SPECIFIC(sptf) != NULL)
    cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf));
#section support_code_struct

/*
 * GpuDnnTransformerGradT: computes the gradient wrt the affine
 * transformation theta (*dtheta, shape (N, 2, 3)) from the gradient wrt
 * the sampling grid (dgrid, shape (N, H, W, 2)), using
 * cudnnSpatialTfGridGeneratorBackward.
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_gt)(PyGpuArrayObject * dgrid,
                            PyGpuArrayObject ** dtheta,
                            cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = dgrid->context;
    int num_images, height, width;
    int desc_dims[4];
    size_t dtheta_dims[3];
    cudnnDataType_t dt;
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    // Map the gpuarray typecode onto the cuDNN data type.
    switch(dgrid->ga.typecode)
    {
    case GA_DOUBLE:
        dt = CUDNN_DATA_DOUBLE;
        break;
    case GA_FLOAT:
        dt = CUDNN_DATA_FLOAT;
        break;
    case GA_HALF:
        dt = CUDNN_DATA_HALF;
        break;
    default:
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformerGradT: unsupported data type for dgrid in spatial transformer." );
        return 1;
    }

    num_images = (int) PyGpuArray_DIM( dgrid, 0 );
    height = (int) PyGpuArray_DIM( dgrid, 1 );
    width = (int) PyGpuArray_DIM( dgrid, 2 );

    // One 2x3 affine transformation matrix per image.
    dtheta_dims[0] = num_images;
    dtheta_dims[1] = 2;
    dtheta_dims[2] = 3;

    if ( theano_prep_output( dtheta, 3, dtheta_dims, dgrid->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
        return 1;

    desc_dims[0] = num_images;
    // Assume number of channels is 1, because the information is not
    // available or relevant here
    desc_dims[1] = 1;
    desc_dims[2] = height;
    desc_dims[3] = width;

    // Currently, only the bilinear sampler is supported by cuDNN,
    // so the sampler method is currently not available as a parameter
    err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                                 dt, 4, desc_dims );
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        // Fixed: the error message previously named the wrong Op
        // ("GpuDnnTransformerGrid").
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerGradT: could not initialize descriptor (sptf): %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    // Order the cuDNN call against pending accesses of dgrid and dtheta.
    cuda_enter( gpu_ctx->ctx );

    cuda_wait( dgrid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*dtheta)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfGridGeneratorBackward( _handle, APPLY_SPECIFIC(sptf),
        PyGpuArray_DEV_DATA( dgrid ), PyGpuArray_DEV_DATA( *dtheta ) );

    cuda_record( dgrid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*dtheta)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    cuda_exit( gpu_ctx->ctx );

    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerGradT: could not compute gradients of the affine transformation: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    return 0;
}
#section support_code_struct

/* Per-instance cuDNN descriptors for GpuDnnTransformerSampler (forward
   sampling pass of the spatial transformer). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(ydesc);

#section init_code_struct

/* NULL-initialize all handles so cleanup only destroys created ones. */
APPLY_SPECIFIC(sptf) = NULL;
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(ydesc) = NULL;

{
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf));
    if (err != CUDNN_STATUS_SUCCESS)
    {
        PyErr_Format(PyExc_MemoryError,
            "GpuDnnTransformerSampler: could not allocate spatial transformer descriptor (sptf): %s",
            cudnnGetErrorString( err ));
        FAIL;
    }

    /* Tensor descriptor for the input images. */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor xdesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }

    /* Tensor descriptor for the sampled output. */
    err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(ydesc) );
    if ( err != CUDNN_STATUS_SUCCESS )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor ydesc: %s",
            cudnnGetErrorString( err ) );
        FAIL;
    }
}

#section cleanup_code_struct

/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
    cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf));
if ( APPLY_SPECIFIC(xdesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(ydesc) != NULL )
    cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(ydesc) );
#section support_code_struct

/*
 * GpuDnnTransformerSampler: forward sampling pass of the cuDNN spatial
 * transformer. Samples pixels from `input` at the coordinates given by
 * `grid` and writes the result into *output (shape N x C x Hg x Wg, where
 * Hg/Wg come from the grid).
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_sampler)(PyGpuArrayObject * input,
                                 PyGpuArrayObject * grid,
                                 PyGpuArrayObject ** output,
                                 cudnnHandle_t _handle)
{
    PyGpuContextObject * gpu_ctx = input->context;
    void * alpha_p;
    void * beta_p;
    double alpha = 1.0, beta = 0.0;
    float af = alpha, bf = beta;
    size_t out_dims[4];
    int desc_dims[4];
    cudnnDataType_t dt;
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

    // Select the cuDNN data type and matching scaling factors: cuDNN
    // expects double alpha/beta for double data, float alpha/beta for
    // float and half data.
    switch (input->ga.typecode)
    {
    case GA_DOUBLE:
        alpha_p = (void *)&alpha;
        beta_p = (void *)&beta;
        dt = CUDNN_DATA_DOUBLE;
        break;
    case GA_FLOAT:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        dt = CUDNN_DATA_FLOAT;
        break;
    case GA_HALF:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        dt = CUDNN_DATA_HALF;
        break;
    default:
        PyErr_SetString( PyExc_TypeError,
            "GpuDnnTransformer: unsupported type for input in spatial transformer." );
        return 1;
    }

    // Output shape: batch and channels from the input, spatial extent
    // from the sampling grid.
    out_dims[0] = (size_t) PyGpuArray_DIM(input, 0); // num_images
    out_dims[1] = (size_t) PyGpuArray_DIM(input, 1); // num_channels
    out_dims[2] = (size_t) PyGpuArray_DIM(grid, 1); // grid height
    out_dims[3] = (size_t) PyGpuArray_DIM(grid, 2); // grid width

    // Set output dimensions for the descriptor setup
    desc_dims[0] = (int) out_dims[0];
    desc_dims[1] = (int) out_dims[1];
    desc_dims[2] = (int) out_dims[2];
    desc_dims[3] = (int) out_dims[3];

    // cuDNN does not support zero-length dimensions, so reject them
    // before any descriptor setup or allocation.
    if ( out_dims[0] == 0 || out_dims[1] == 0 || out_dims[2] == 0 || out_dims[3] == 0 )
    {
        PyErr_SetString( PyExc_RuntimeError,
            "GpuDnnTransformerSampler: one of the sampler dimensions is zero" );
        return 1;
    }

    if ( theano_prep_output( output, 4, out_dims, input->ga.typecode,
                             GA_C_ORDER, gpu_ctx ) != 0 )
    {
        PyErr_SetString( PyExc_MemoryError,
            "GpuDnnTransformerSampler: could not allocate memory for grid sampler" );
        return 1;
    }

    // Currently, only the bilinear sampler is supported by cuDNN,
    // so the sampler method is currently not available as a parameter
    err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                                 dt, 4, desc_dims );
    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_MemoryError,
            "GpuDnnTransformerSampler: could not initialize descriptor: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
        return 1;

    if ( c_set_tensorNd( *output, APPLY_SPECIFIC(ydesc) ) != 0 )
        return 1;

    // Order the cuDNN call against pending reads/writes of its operands
    // (libgpuarray stream ordering).
    cuda_enter( gpu_ctx->ctx );

    cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    err = cudnnSpatialTfSamplerForward( _handle, APPLY_SPECIFIC(sptf), alpha_p,
        APPLY_SPECIFIC(xdesc), PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( grid ),
        beta_p, APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) );

    cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

    cuda_exit( gpu_ctx->ctx );

    if ( CUDNN_STATUS_SUCCESS != err )
    {
        PyErr_Format( PyExc_RuntimeError,
            "GpuDnnTransformerSampler: could not create grid sampler: %s",
            cudnnGetErrorString( err ) );
        return 1;
    }

    return 0;
}
......@@ -13,6 +13,7 @@ from theano import Op, Apply, tensor, config, Variable
from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
int32 as int_t, bool as bool_t, uint32 as uint32_t)
from theano.tensor import as_tensor_variable, Argmax
from theano.tensor.extra_ops import cpu_contiguous
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler
......@@ -2782,6 +2783,235 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
return result
class GpuDnnTransformerGrid(DnnBase):
    """
    Grid generator Op for cuDNN Spatial Transformer.
    """
    __props__ = ()
    _cop_num_inputs = 2
    _cop_num_outputs = 1
    _f16_ok = True
    check_input = False

    def __init__(self):
        DnnBase.__init__(self, ["c_code/dnn_sptf_grid.c"],
                         "APPLY_SPECIFIC(dnn_sptf_grid)")

    def make_node(self, theta, out_dims):
        """
        Create a grid generator node for a cuDNN Spatial Transformer

        Parameters
        ----------
        theta : tensor
            Affine transformation tensor containing one affine transformation
            matrix per image. ``theta`` is usually generated by the localization
            network.
        out_dims : tuple
            Dimensions of the transformed inputs, containing four elements, and is given
            by (N, C, H, W), where N is the number of inputs, C the number of channels,
            H and W are the height and width of each input.
        """
        ctx_name = infer_context_name(theta)

        # theta must be a contiguous 3D float tensor on the GPU.
        theta = gpu_contiguous(as_gpuarray_variable(theta, ctx_name))
        assert theta.dtype in ('float16', 'float32', 'float64')
        assert theta.ndim == 3

        # out_dims is validated as a 1D integer vector on the host.
        out_dims = cpu_contiguous(as_tensor_variable(out_dims))
        assert out_dims.dtype in theano.tensor.basic.integer_dtypes
        assert out_dims.ndim == 1
        # Ensure 64-bit ints are passed to the C code
        out_dims = theano.tensor.basic.cast(out_dims, 'int64')

        # The grid has one more dimension than theta: (N, H, W, 2).
        grid = GpuArrayType(dtype=theta.dtype,
                            broadcastable=(theta.type.ndim + 1) * (False,),
                            context_name=ctx_name)()

        return Apply(self, [theta, out_dims], [grid])

    def grad(self, inputs, grads):
        theta, out_dims = inputs
        dgrid = grads[0]

        # out_dims is integer-valued; its gradient is not defined.
        dtheta = GpuDnnTransformerGradT()(dgrid)
        return [dtheta, grad_not_implemented(self, 1, out_dims)]
class GpuDnnTransformerSampler(DnnBase):
    """
    Grid sampler Op for cuDNN Spatial Transformer.
    """
    __props__ = ()
    _cop_num_inputs = 2
    _cop_num_outputs = 1
    _f16_ok = True
    check_input = False

    def __init__(self):
        DnnBase.__init__(self, ["c_code/dnn_sptf_sampler.c"],
                         "APPLY_SPECIFIC(dnn_sptf_sampler)")

    def make_node(self, img, grid):
        """
        Create a grid sampler node for a cuDNN Spatial Transformer

        Parameters
        ----------
        img : tensor
            Images from which the pixels will be sampled. The implementation
            assumes the tensor is in NCHW format, where N is the number of images,
            C is the number of color channels, H is the height of the inputs, and
            W is width of the inputs.
        grid : GpuDnnTransformerGrid
            Grid that contains the coordinates of the pixels to be sampled from
            the inputs images.
        """
        ctx_name = infer_context_name(img, grid)

        # Both inputs must be contiguous 4D floating-point GPU tensors.
        img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
        if img.type.ndim != 4:
            raise TypeError('img must be a 4D tensor')
        elif img.dtype not in ('float16', 'float32', 'float64'):
            raise TypeError('img type must be floating-point')

        grid = gpu_contiguous(as_gpuarray_variable(grid, ctx_name))
        if grid.type.ndim != 4:
            raise TypeError('grid must be a 4D tensor')
        elif grid.dtype not in ('float16', 'float32', 'float64'):
            raise TypeError('grid type must be floating-point')

        out = GpuArrayType(dtype=img.dtype,
                           broadcastable=img.type.ndim * (False,),
                           context_name=ctx_name)()

        return Apply(self, [img, grid], [out])

    def grad(self, inputs, grads):
        img, grid = inputs
        dy = grads[0]

        # A single cuDNN call produces the gradients wrt both inputs.
        dimg, dgrid = GpuDnnTransformerGradI()(img, grid, dy)
        return [dimg, dgrid]
class GpuDnnTransformerGradI(DnnBase):
    """
    Gradient of inputs Op for cuDNN Spatial Transformer.
    """
    __props__ = ()
    _cop_num_inputs = 3
    _cop_num_outputs = 2
    _f16_ok = True
    check_input = False

    def __init__(self):
        DnnBase.__init__(self, ["c_code/dnn_sptf_gi.c"],
                         "APPLY_SPECIFIC(dnn_sptf_gi)")

    def make_node(self, img, grid, dy):
        """
        Create a node computing the gradients wrt the sampler inputs.

        Parameters
        ----------
        img : tensor
            4D (NCHW) images used in the forward sampling pass.
        grid : tensor
            4D sampling grid used in the forward sampling pass.
        dy : tensor
            4D gradient of the sampler output.
        """
        context_name = infer_context_name(img, grid, dy)

        # Transfer to the GPU context first, then make contiguous, for
        # consistency with GpuDnnTransformerGrid and GpuDnnTransformerSampler
        # (the previous order applied gpu_contiguous before the variable was
        # converted to a GPU variable on the right context).
        img = gpu_contiguous(as_gpuarray_variable(img, context_name))
        if img.ndim != 4:
            raise TypeError('img must have 4 dimensions.')

        grid = gpu_contiguous(as_gpuarray_variable(grid, context_name))
        if img.ndim != grid.ndim:
            raise TypeError('grid should have the same number of dimensions as img')

        dy = as_gpuarray_variable(dy, context_name)
        if dy.ndim != 4:
            raise TypeError('dy must have 4 dimensions.')

        # Gradients have the same types/shapes as the corresponding inputs.
        dimg = img.type()
        dgrid = grid.type()

        return Apply(self, [img, grid, dy], [dimg, dgrid])
class GpuDnnTransformerGradT(DnnBase):
    """
    Gradient of affine transformations Op for cuDNN Spatial Transformer.
    """
    __props__ = ()
    _cop_num_inputs = 1
    _cop_num_outputs = 1
    _f16_ok = True
    check_input = False

    def __init__(self):
        DnnBase.__init__(self, ["c_code/dnn_sptf_gt.c"],
                         "APPLY_SPECIFIC(dnn_sptf_gt)")

    def make_node(self, dgrid):
        """
        Create a node computing the gradient wrt theta from the gradient
        wrt the sampling grid.
        """
        ctx_name = infer_context_name(dgrid)

        dgrid = as_gpuarray_variable(dgrid, ctx_name)
        assert dgrid.dtype in ('float16', 'float32', 'float64')
        assert dgrid.ndim == 4

        # dtheta is 3D: one 2x3 transformation matrix per image.
        dtheta = GpuArrayType(dtype=dgrid.dtype,
                              broadcastable=(dgrid.type.ndim - 1) * (False,),
                              context_name=ctx_name)()

        return Apply(self, [dgrid], [dtheta])
def dnn_spatialtf(img, theta, scale_width=1, scale_height=1):
    """
    GPU spatial transformer using cuDNN from NVIDIA.

    Parameters
    ----------
    img : tensor
        Images to which the transformations will be applied. The implementation
        assumes the tensor is in NCHW format, where N is the number of images,
        C is the number of color channels, H is the height of the inputs, and
        W is width of the inputs.
    theta : tensor
        Affine transformation tensor containing one affine transformation
        matrix per image. ``theta`` is usually generated by the localization
        network.
    scale_height: float
        A float specifying the scaling factor for the height of the output
        image. A value of 1 will keep the original height of the input. Values
        larger than 1 will upsample the input. Values below 1 will downsample
        the input.
    scale_width: float
        A float specifying the scaling factor for the width of the output
        image. A value of 1 will keep the original width of the input. Values
        larger than 1 will upsample the input. Values below 1 will downsample
        the input.

    Returns
    -------
    out : tensor
        Transformed images with width and height properly scaled.

    Notes
    -----
    Currently, cuDNN only supports 2D transformations with 2x3 affine
    transformation matrices.

    Bilinear interpolation is the only grid sampler method available.
    """
    # Output spatial extent: input extent scaled and rounded up.
    scaled_height = theano.tensor.ceil(img.shape[2] * scale_height)
    scaled_width = theano.tensor.ceil(img.shape[3] * scale_width)
    out_dims = tuple(as_scalar(dim).astype('int64')
                     for dim in (img.shape[0], img.shape[1],
                                 scaled_height, scaled_width))

    # Setup spatial transformer: generate the sampling grid from theta,
    # then sample the input images at the grid coordinates.
    grid = GpuDnnTransformerGrid()(theta, out_dims)
    return GpuDnnTransformerSampler()(img, grid)
def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights,
......@@ -3137,6 +3367,7 @@ def local_dnn_convgw_inplace(node, inputs):
def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
......
......@@ -2296,3 +2296,226 @@ class Cudnn_grouped_conv(Grouped_conv_noOptim):
conv_gradi_op = dnn.GpuDnnConvGradI
flip_filter = False
is_dnn = True
def test_dnn_spatialtf():
    # Skip if cuDNN (with spatial transformer support) is not available.
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    utt.seed_rng()

    """
    Spatial Transformer implementation using Theano from Lasagne
    Original author: skaae (https://github.com/skaae)
    """

    def spatialtf_cpu(inp, theta, scale_height, scale_width, border_mode='nearest'):
        # CPU reference implementation of the spatial transformer,
        # used to validate the cuDNN output below.
        num_batch, num_channels, height, width = inp.shape
        theta = T.reshape(theta, (-1, 2, 3))

        # grid of (x_t, y_t, 1), eq (1) in ref [1]
        out_height = T.cast(T.ceil(height * scale_height), 'int64')
        out_width = T.cast(T.ceil(width * scale_width), 'int64')
        grid = _meshgrid(out_height, out_width)

        # transform a x (x_t, y_t, 1)^t -> (x_s, y_s)
        t_g = T.dot(theta, grid)
        x_s = t_g[:, 0]
        y_s = t_g[:, 1]
        x_s_flat = x_s.flatten()
        y_s_flat = y_s.flatten()

        # dimshuffle input to (bs, height, width, channels)
        input_dim = inp.dimshuffle(0, 2, 3, 1)
        input_transformed = _interpolate(
            input_dim, x_s_flat, y_s_flat,
            out_height, out_width, border_mode)

        output = T.reshape(
            input_transformed, (num_batch, out_height, out_width, num_channels))
        output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
        return output

    def _interpolate(im, x, y, out_height, out_width, border_mode):
        # Bilinear interpolation of `im` at the (x, y) sample coordinates.
        # *_f are floats
        num_batch, height, width, channels = im.shape
        height_f = T.cast(height, theano.config.floatX)
        width_f = T.cast(width, theano.config.floatX)

        # scale coordinates from [-1, 1] to [0, dimension - 1], where dimension
        # can be the width or height
        x = (x + 1) / 2 * (width_f - 1)
        y = (y + 1) / 2 * (height_f - 1)

        # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
        # we need those in floatX for interpolation and in int64 for indexing.
        x0_f = T.floor(x)
        y0_f = T.floor(y)
        x1_f = x0_f + 1
        y1_f = y0_f + 1

        # for indexing, we need to take care of the border mode for outside pixels.
        if border_mode == 'nearest':
            x0 = T.clip(x0_f, 0, width_f - 1)
            x1 = T.clip(x1_f, 0, width_f - 1)
            y0 = T.clip(y0_f, 0, height_f - 1)
            y1 = T.clip(y1_f, 0, height_f - 1)
        elif border_mode == 'mirror':
            w = 2 * (width_f - 1)
            x0 = T.minimum(x0_f % w, -x0_f % w)
            x1 = T.minimum(x1_f % w, -x1_f % w)
            h = 2 * (height_f - 1)
            y0 = T.minimum(y0_f % h, -y0_f % h)
            y1 = T.minimum(y1_f % h, -y1_f % h)
        elif border_mode == 'wrap':
            x0 = T.mod(x0_f, width_f)
            x1 = T.mod(x1_f, width_f)
            y0 = T.mod(y0_f, height_f)
            y1 = T.mod(y1_f, height_f)
        else:
            raise ValueError("border_mode must be one of "
                             "'nearest', 'mirror', 'wrap'")
        x0, x1, y0, y1 = (T.cast(v, 'int64') for v in (x0, x1, y0, y1))

        # The input is [num_batch, height, width, channels]. We do the lookup in
        # the flattened input, i.e [num_batch*height*width, channels]. We need
        # to offset all indices to match the flat version
        dim2 = width
        dim1 = width * height
        base = T.repeat(
            T.arange(num_batch, dtype='int64') * dim1, out_height * out_width)
        base_y0 = base + y0 * dim2
        base_y1 = base + y1 * dim2
        idx_a = base_y0 + x0
        idx_b = base_y1 + x0
        idx_c = base_y0 + x1
        idx_d = base_y1 + x1

        # use indices to lookup pixels for all samples
        im_flat = im.reshape((-1, channels))
        Ia = im_flat[idx_a]
        Ib = im_flat[idx_b]
        Ic = im_flat[idx_c]
        Id = im_flat[idx_d]

        # calculate interpolated values
        wa = ((x1_f - x) * (y1_f - y)).dimshuffle(0, 'x')
        wb = ((x1_f - x) * (y - y0_f)).dimshuffle(0, 'x')
        wc = ((x - x0_f) * (y1_f - y)).dimshuffle(0, 'x')
        wd = ((x - x0_f) * (y - y0_f)).dimshuffle(0, 'x')
        output = T.sum([wa * Ia, wb * Ib, wc * Ic, wd * Id], axis=0)
        return output

    def _linspace(start, stop, num):
        # Theano linspace. Behaves similar to np.linspace
        start = T.cast(start, theano.config.floatX)
        stop = T.cast(stop, theano.config.floatX)
        num = T.cast(num, theano.config.floatX)
        step = (stop - start) / (num - 1)
        return T.arange(num, dtype=theano.config.floatX) * step + start

    def _meshgrid(height, width):
        # This function is the grid generator from eq. (1) in reference [1].
        # It is equivalent to the following numpy code:
        #  x_t, y_t = np.meshgrid(np.linspace(-1, 1, width),
        #                         np.linspace(-1, 1, height))
        #  ones = np.ones(np.prod(x_t.shape))
        #  grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
        # It is implemented in Theano instead to support symbolic grid sizes.
        # Note: If the image size is known at layer construction time, we could
        # compute the meshgrid offline in numpy instead of doing it dynamically
        # in Theano. However, it hardly affected performance when we tried.
        x_t = T.dot(T.ones((height, 1)),
                    _linspace(-1.0, 1.0, width).dimshuffle('x', 0))
        y_t = T.dot(_linspace(-1.0, 1.0, height).dimshuffle(0, 'x'),
                    T.ones((1, width)))
        x_t_flat = x_t.reshape((1, -1))
        y_t_flat = y_t.reshape((1, -1))
        ones = T.ones_like(x_t_flat)
        grid = T.concatenate([x_t_flat, y_t_flat, ones], axis=0)
        return grid

    img_dims = (5, 3, 16, 16)
    img = np.random.random(size=img_dims).astype(theano.config.floatX)
    # Downscale height, upscale-ish width, to exercise non-trivial scaling.
    scale_height = 0.25
    scale_width = 0.75

    # Transformation matrix
    transform = [[-1, 0, 0],
                 [0, -1, 0]]
    theta = np.asarray(img_dims[0] * [transform], dtype=theano.config.floatX)

    # Create symbolic variables for inputs and transformations
    t_img = T.tensor4('img')
    t_theta = T.tensor3('theta')

    st_dnn = dnn.dnn_spatialtf(t_img, t_theta, scale_height=scale_height, scale_width=scale_width)
    st_dnn_func = theano.function([t_img, t_theta], st_dnn)

    # Check if function graph contains the spatial transformer's grid and sampler Ops
    apply_nodes = st_dnn_func.maker.fgraph.apply_nodes
    assert any([isinstance(node.op, dnn.GpuDnnTransformerGrid) for node in apply_nodes])
    assert any([isinstance(node.op, dnn.GpuDnnTransformerSampler) for node in apply_nodes])

    img_out_gpu = st_dnn_func(img, theta)
    img_out_gpu = np.asarray(img_out_gpu)

    # Setup CPU Op
    st_cpu = spatialtf_cpu(t_img, t_theta, scale_height, scale_width, 'nearest')
    st_cpu_func = theano.function([t_img, t_theta], st_cpu, mode=mode_without_gpu)
    img_out_cpu = st_cpu_func(img, theta)

    atol, rtol = None, None
    if theano.config.floatX == 'float16':
        # Raise relative error tolerance when using float16
        rtol = 5e-2

    utt.assert_allclose(img_out_cpu, img_out_gpu, atol=atol, rtol=rtol)
def test_dnn_spatialtf_grad():
    # Tests that the spatial transformer gradients are computed through the
    # dedicated cuDNN gradient Ops and validates them numerically.
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    utt.seed_rng()

    inputs = T.tensor4('inputs')
    theta = T.tensor3('theta')

    out = dnn.dnn_spatialtf(inputs, theta, scale_height=0.25, scale_width=0.75)
    out_mean = T.mean(out)
    mean_gi = T.grad(out_mean, [inputs])
    mean_gt = T.grad(out_mean, [theta])

    f_gi = theano.function([inputs, theta], mean_gi)
    assert any([isinstance(node.op, dnn.GpuDnnTransformerGradI)
                for node in f_gi.maker.fgraph.apply_nodes])

    f_gt = theano.function([inputs, theta], mean_gt)
    assert any([isinstance(node.op, dnn.GpuDnnTransformerGradT)
                for node in f_gt.maker.fgraph.apply_nodes])

    input_dims = (5, 3, 16, 16)
    inputs_val = np.random.random(size=input_dims).astype(theano.config.floatX)
    # Tensor with transformations
    theta_val = np.random.random((input_dims[0], 2, 3)).astype(theano.config.floatX)
    # Using smaller values for theta, increases the precision of gradients
    # when using lower precision. Tests might fail for lower precision data
    # types if the values of theta or the inputs are very high.
    # Fixed: this must scale the numeric values (theta_val), not the symbolic
    # variable `theta` -- rebinding the symbolic variable had no effect on the
    # values fed to the compiled functions below.
    theta_val /= 100

    # Check that the gradients are computed
    f_gi(inputs_val, theta_val)
    f_gt(inputs_val, theta_val)

    def grad_functor(inputs, theta):
        out = dnn.dnn_spatialtf(inputs, theta)
        return out

    atol, rtol = None, None
    if theano.config.floatX == 'float32':
        rtol = 5e-2
    elif theano.config.floatX == 'float16':
        rtol = 1e-0

    utt.verify_grad(grad_functor, [inputs_val, theta_val], mode=mode_with_gpu,
                    abs_tol=atol, rel_tol=rtol)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论