提交 c25df18f authored 作者: João Victor Risso's avatar João Victor Risso

Split spatial transformer implementation into grid and sampler Ops

上级 659b7c8f
#section support_code_struct
int
APPLY_SPECIFIC(dnn_sptf_grid)(PyGpuArrayObject * theta,
PyArrayObject * grid_dims,
cudnnSpatialTransformerDescriptor_t desc,
PyGpuArrayObject ** grid,
cudnnHandle_t _handle)
{
PyGpuContextObject * gpu_ctx = theta->context;
size_t gpu_grid_dims[4];
int num_images, num_channels, height, width;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if ( theta->ga.typecode != GA_FLOAT &&
theta->ga.typecode != GA_DOUBLE &&
theta->ga.typecode != GA_HALF )
{
PyErr_SetString( PyExc_TypeError,
"GpuDnnTransformerGrid: unsupported data type for theta in spatial transformer." );
return 1;
}
else if ( PyGpuArray_DIM( theta, 1 ) != 2 && PyGpuArray_DIM( theta, 2 ) != 3 )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformerGrid: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
PyGpuArray_DIMS( theta )[0], 2, 3, PyGpuArray_DIMS( theta )[0],
PyGpuArray_DIMS( theta )[1], PyGpuArray_DIMS( theta )[2] );
return 1;
}
if ( PyArray_NDIM( grid_dims ) != 1 || PyArray_SIZE( grid_dims ) != 4 )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformerGrid: grid_dims must have 4 elements." );
return 1;
}
// Obtain grid dimensions
num_images = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 0 ) );
height = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 2 ) );
width = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 3 ) );
gpu_grid_dims[0] = num_images;
gpu_grid_dims[1] = height;
gpu_grid_dims[2] = width;
gpu_grid_dims[3] = 2;
if ( theano_prep_output( grid, 4, gpu_grid_dims, theta->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformerGrid: could not allocate memory for grid of coordinates" );
return 1;
}
cuda_enter( gpu_ctx->ctx );
cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
err = cudnnSpatialTfGridGeneratorForward( _handle, desc, PyGpuArray_DEV_DATA( theta ),
PyGpuArray_DEV_DATA( *grid ) );
cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_exit( gpu_ctx->ctx );
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformerGrid: could not create grid of coordinates: %s",
cudnnGetErrorString( err ) );
return 1;
}
return 0;
}
\ No newline at end of file
...@@ -14,7 +14,7 @@ APPLY_SPECIFIC(ydesc) = NULL; ...@@ -14,7 +14,7 @@ APPLY_SPECIFIC(ydesc) = NULL;
if ( err != CUDNN_STATUS_SUCCESS ) if ( err != CUDNN_STATUS_SUCCESS )
{ {
PyErr_Format( PyExc_MemoryError, PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor xdesc: %s", "GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor xdesc: %s",
cudnnGetErrorString( err ) ); cudnnGetErrorString( err ) );
FAIL; FAIL;
} }
...@@ -23,7 +23,7 @@ APPLY_SPECIFIC(ydesc) = NULL; ...@@ -23,7 +23,7 @@ APPLY_SPECIFIC(ydesc) = NULL;
if ( err != CUDNN_STATUS_SUCCESS ) if ( err != CUDNN_STATUS_SUCCESS )
{ {
PyErr_Format( PyExc_MemoryError, PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor ydesc: %s", "GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor ydesc: %s",
cudnnGetErrorString( err ) ); cudnnGetErrorString( err ) );
FAIL; FAIL;
} }
...@@ -40,22 +40,19 @@ if ( APPLY_SPECIFIC(ydesc) != NULL ) ...@@ -40,22 +40,19 @@ if ( APPLY_SPECIFIC(ydesc) != NULL )
#section support_code_struct #section support_code_struct
int int
APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input, APPLY_SPECIFIC(dnn_sptf_sampler)(PyGpuArrayObject * input,
PyGpuArrayObject * theta, PyGpuArrayObject * grid,
PyArrayObject * grid_dims, cudnnSpatialTransformerDescriptor_t desc,
cudnnSpatialTransformerDescriptor_t desc, PyGpuArrayObject ** output,
PyGpuArrayObject ** output, cudnnHandle_t _handle)
PyGpuArrayObject ** grid,
cudnnHandle_t _handle)
{ {
PyGpuContextObject * gpu_ctx = input->context; PyGpuContextObject * gpu_ctx = input->context;
void * alpha_p; void * alpha_p;
void * beta_p; void * beta_p;
double alpha = 1.0, beta = 0.0; double alpha = 1.0, beta = 0.0;
float af = alpha, bf = beta; float af = alpha, bf = beta;
size_t out_dims[4];
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
int num_images, num_channels, height, width;
size_t gpu_grid_dims[4], out_dims[4];
switch (input->ga.typecode) switch (input->ga.typecode)
{ {
...@@ -77,81 +74,23 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input, ...@@ -77,81 +74,23 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
return 1; return 1;
} }
if ( theta->ga.typecode != GA_FLOAT && out_dims[0] = (size_t) PyGpuArray_DIM(input, 0); // num_images
theta->ga.typecode != GA_DOUBLE && out_dims[1] = (size_t) PyGpuArray_DIM(input, 1); // num_channels
theta->ga.typecode != GA_HALF ) out_dims[2] = (size_t) PyGpuArray_DIM(grid, 1); // grid width
{ out_dims[3] = (size_t) PyGpuArray_DIM(grid, 2); // grid height
PyErr_SetString( PyExc_TypeError,
"GpuDnnTransformer: unsupported data type for theta in spatial transformer." );
return 1;
}
else if ( PyGpuArray_DIM( theta, 1 ) != 2 && PyGpuArray_DIM( theta, 2 ) != 3 )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
PyGpuArray_DIMS( theta )[0], 2, 3, PyGpuArray_DIMS( theta )[0],
PyGpuArray_DIMS( theta )[1], PyGpuArray_DIMS( theta )[2] );
return 1;
}
if ( PyArray_NDIM( grid_dims ) != 1 || PyArray_SIZE( grid_dims ) != 4 ) if ( out_dims[0] == 0 || out_dims[1] == 0 || out_dims[2] == 0 || out_dims[3] == 0 )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: grid_dims must have 4 elements." );
return 1;
}
// Obtain grid dimensions
num_images = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 0 ) );
num_channels = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 1 ) );
height = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 2 ) );
width = (int) *( (npy_int64 *) PyArray_GETPTR1( grid_dims, 3 ) );
gpu_grid_dims[0] = num_images;
gpu_grid_dims[1] = height;
gpu_grid_dims[2] = width;
gpu_grid_dims[3] = 2;
out_dims[0] = num_images;
out_dims[1] = num_channels;
out_dims[2] = height;
out_dims[3] = width;
if ( width == 0 || height == 0 || num_images == 0 )
{ {
PyErr_SetString( PyExc_RuntimeError, PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: grid_dims has a dimension with value zero" ); "GpuDnnTransformerSampler: one of the sampler dimensions is zero" );
return 1; return 1;
}
if ( PyGpuArray_DIM( input, 0 ) != num_images )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: expected batch size %d, got %d.",
num_images, PyGpuArray_DIM( input, 0 ) );
return 1;
}
else if ( PyGpuArray_DIM( input, 1 ) != num_channels )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: expected input to have %d channels, got %d channels.",
num_channels, PyGpuArray_DIM( input, 1 ) );
return 1;
}
if ( theano_prep_output( grid, 4, gpu_grid_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformer: could not allocate memory for grid of coordinates" );
return 1;
} }
if ( theano_prep_output( output, 4, out_dims, input->ga.typecode, if ( theano_prep_output( output, 4, out_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 ) GA_C_ORDER, gpu_ctx ) != 0 )
{ {
PyErr_SetString( PyExc_MemoryError, PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformer: could not allocate memory for grid sampler" ); "GpuDnnTransformerSampler: could not allocate memory for grid sampler" );
return 1; return 1;
} }
...@@ -164,29 +103,15 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input, ...@@ -164,29 +103,15 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
cuda_enter( gpu_ctx->ctx ); cuda_enter( gpu_ctx->ctx );
cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
err = cudnnSpatialTfGridGeneratorForward( _handle, desc, PyGpuArray_DEV_DATA( theta ),
PyGpuArray_DEV_DATA( *grid ) );
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: could not create grid of coordinates: %s",
cudnnGetErrorString( err ) );
cuda_exit( gpu_ctx->ctx );
return 1;
}
err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, APPLY_SPECIFIC(xdesc), err = cudnnSpatialTfSamplerForward( _handle, desc, alpha_p, APPLY_SPECIFIC(xdesc),
PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( *grid ), beta_p, PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( grid ), beta_p,
APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) ); APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) );
cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_exit( gpu_ctx->ctx ); cuda_exit( gpu_ctx->ctx );
...@@ -194,10 +119,10 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input, ...@@ -194,10 +119,10 @@ APPLY_SPECIFIC(dnn_sptf)(PyGpuArrayObject * input,
if ( CUDNN_STATUS_SUCCESS != err ) if ( CUDNN_STATUS_SUCCESS != err )
{ {
PyErr_Format( PyExc_RuntimeError, PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformer: could not create grid sampler: %s", "GpuDnnTransformerSampler: could not create grid sampler: %s",
cudnnGetErrorString( err ) ); cudnnGetErrorString( err ) );
return 1; return 1;
} }
return 0; return 0;
} }
\ No newline at end of file
...@@ -2884,34 +2884,22 @@ class GpuDnnTransformerDesc(COp): ...@@ -2884,34 +2884,22 @@ class GpuDnnTransformerDesc(COp):
return (super(GpuDnnTransformerDesc, self).c_code_cache_version(), version()) return (super(GpuDnnTransformerDesc, self).c_code_cache_version(), version())
class GpuDnnTransformer(DnnBase): class GpuDnnTransformerGrid(DnnBase):
"""
Spatial transformer that can be used in spatial transformer networks, it
implements the grid generator and sampler. The localization network can
be built using neural net components of Theano.
"""
__props__ = () __props__ = ()
_cop_num_inputs = 4 _cop_num_inputs = 3
_cop_num_outputs = 2 _cop_num_outputs = 1
_f16_ok = True _f16_ok = True
default_output = 0
def __init__(self): def __init__(self):
DnnBase.__init__(self, ["c_code/dnn_sptf.c"], "APPLY_SPECIFIC(dnn_sptf)") DnnBase.__init__(self, ["c_code/dnn_sptf_grid.c"], "APPLY_SPECIFIC(dnn_sptf_grid)")
def make_node(self, img, theta, desc): def make_node(self, theta, desc):
context_name = infer_context_name(desc) context_name = infer_context_name(desc)
if (not isinstance(desc.type, CDataType) or if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnSpatialTransformerDescriptor_t'): desc.type.ctype != 'cudnnSpatialTransformerDescriptor_t'):
raise ValueError('desc must be cudnnSpatialTransformerDescriptor_t') raise ValueError('desc must be cudnnSpatialTransformerDescriptor_t')
img = gpu_contiguous(as_gpuarray_variable(img, context_name))
if img.type.ndim != 4:
raise TypeError('img must be a 4D tensor')
elif img.dtype not in ('float16', 'float32', 'float64'):
raise TypeError('img type must be floating-point')
theta = gpu_contiguous(as_gpuarray_variable(theta, context_name)) theta = gpu_contiguous(as_gpuarray_variable(theta, context_name))
assert theta.dtype in ('float16', 'float32', 'float64') assert theta.dtype in ('float16', 'float32', 'float64')
assert theta.ndim == 3 assert theta.ndim == 3
...@@ -2922,33 +2910,72 @@ class GpuDnnTransformer(DnnBase): ...@@ -2922,33 +2910,72 @@ class GpuDnnTransformer(DnnBase):
assert grid_dims.ndim == 1 assert grid_dims.ndim == 1
# Ensure 64-bit ints are passed to the C code # Ensure 64-bit ints are passed to the C code
grid_dims = theano.tensor.basic.cast(grid_dims, 'int64') grid_dims = theano.tensor.basic.cast(grid_dims, 'int64')
grid = GpuArrayType(dtype=theta.dtype,
broadcastable=(theta.type.ndim + 1) * (False,),
context_name=context_name)()
output = GpuArrayType(dtype=img.dtype, inputs = [theta, grid_dims, desc]
broadcastable=img.type.ndim * (False,), outputs = [grid]
context_name=context_name)() return Apply(self, inputs, outputs)
grid = GpuArrayType(dtype=img.dtype, def grad(self, inputs, grads):
broadcastable=img.type.ndim * (False,), theta, grid_dims, desc = inputs
context_name=context_name)() dgrid = grads[0]
dtheta = GpuDnnTransformerGradT()(dgrid, desc)
return [dtheta, grad_not_implemented(self, 1, grid_dims), DisconnectedType()()]
inputs = [img, theta, grid_dims, desc] def connection_pattern(self, node):
outputs = [output, grid] # not connected to desc
return [[1], [1], [0]]
class GpuDnnTransformerSampler(DnnBase):
__props__ = ()
_cop_num_inputs = 3
_cop_num_outputs = 1
_f16_ok = True
def __init__(self):
DnnBase.__init__(self, ["c_code/dnn_sptf_sampler.c"], "APPLY_SPECIFIC(dnn_sptf_sampler)")
def make_node(self, img, grid, desc):
context_name = infer_context_name(desc)
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnSpatialTransformerDescriptor_t'):
raise ValueError('desc must be cudnnSpatialTransformerDescriptor_t')
img = gpu_contiguous(as_gpuarray_variable(img, context_name))
if img.type.ndim != 4:
raise TypeError('img must be a 4D tensor')
elif img.dtype not in ('float16', 'float32', 'float64'):
raise TypeError('img type must be floating-point')
grid = gpu_contiguous(as_gpuarray_variable(grid, context_name))
if grid.type.ndim != 4:
raise TypeError('grid must be a 4D tensor')
elif grid.dtype not in ('float16', 'float32', 'float64'):
raise TypeError('grid type must be floating-point')
out = GpuArrayType(dtype=img.dtype,
broadcastable=img.type.ndim * (False,),
context_name=context_name)()
inputs = [img, grid, desc]
outputs = [out]
return Apply(self, inputs, outputs) return Apply(self, inputs, outputs)
def L_op(self, inputs, outputs, grads): def grad(self, inputs, grads):
img, _, grid_dims, desc = inputs img, grid, desc = inputs
_, grid = outputs
dy = grads[0] dy = grads[0]
dimg, dgrid = GpuDnnTransformerGradI()(img, grid, dy, desc) dimg, dgrid = GpuDnnTransformerGradI()(img, grid, dy, desc)
dtheta = GpuDnnTransformerGradT()(dgrid, desc) return [dimg, dgrid, DisconnectedType()()]
dgrid_dims = grad_not_implemented(self, grid_dims, 2)
return [dimg, dtheta, dgrid_dims, DisconnectedType()()]
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1, 1], [1, 1], [1, 1], [0, 0]] return [[1], [1], [0]]
class GpuDnnTransformerGradI(DnnBase): class GpuDnnTransformerGradI(DnnBase):
...@@ -3096,8 +3123,9 @@ def dnn_spatialtf(img, theta, scale_width=1, scale_height=1, precision=theano.co ...@@ -3096,8 +3123,9 @@ def dnn_spatialtf(img, theta, scale_width=1, scale_height=1, precision=theano.co
assert theta.ndim == 3 assert theta.ndim == 3
# Setup spatial transformer # Setup spatial transformer
transformer = GpuDnnTransformer()(img, theta, desc) grid = GpuDnnTransformerGrid()(theta, desc)
return transformer sampler = GpuDnnTransformerSampler()(img, grid, desc)
return sampler
@local_optimizer([AbstractConv2d, AbstractConv3d]) @local_optimizer([AbstractConv2d, AbstractConv3d])
......
...@@ -2456,8 +2456,10 @@ def test_dnn_spatialtf(): ...@@ -2456,8 +2456,10 @@ def test_dnn_spatialtf():
st_dnn = dnn.dnn_spatialtf(t_img, t_theta, scale_height=scale_height, st_dnn = dnn.dnn_spatialtf(t_img, t_theta, scale_height=scale_height,
scale_width=scale_width) scale_width=scale_width)
st_dnn_func = theano.function([t_img, t_theta], st_dnn) st_dnn_func = theano.function([t_img, t_theta], st_dnn)
# Check if function graph contains the spatial transformer Op # Check if function graph contains the spatial transformer's grid and sampler Ops
assert any([isinstance(node.op, dnn.GpuDnnTransformer) assert any([isinstance(node.op, dnn.GpuDnnTransformerGrid)
for node in st_dnn_func.maker.fgraph.toposort()])
assert any([isinstance(node.op, dnn.GpuDnnTransformerSampler)
for node in st_dnn_func.maker.fgraph.toposort()]) for node in st_dnn_func.maker.fgraph.toposort()])
img_out_gpu = st_dnn_func(img, transform) img_out_gpu = st_dnn_func(img, transform)
...@@ -2508,21 +2510,3 @@ def test_dnn_spatialtf_grad(): ...@@ -2508,21 +2510,3 @@ def test_dnn_spatialtf_grad():
assert any([isinstance(node.op, dnn.GpuDnnTransformerGradT) assert any([isinstance(node.op, dnn.GpuDnnTransformerGradT)
for node in grad_fn.maker.fgraph.toposort()]) for node in grad_fn.maker.fgraph.toposort()])
# Verify grad wrt input
def functor_wrt_i(input):
desc = dnn.GpuDnnTransformerDesc(theano.config.floatX)(out_shp)
transformed_input = dnn.GpuDnnTransformer()(input, theta, desc)
grad = T.grad(T.mean(transformed_input), input)
return grad
# Verify grad wrt theta
def functor_wrt_t(theta):
desc = dnn.GpuDnnTransformerDesc(theano.config.floatX)(out_shp)
transformed_input = dnn.GpuDnnTransformer()(img, theta, out, desc)
grad = T.grad(T.mean(transformed_input), theta)
return grad
utt.verify_grad(functor_wrt_i, [img])
utt.verify_grad(functor_wrt_t, [theta])
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论