提交 4736c9b3 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #2665 from ballasn/conv2d_interface

New conv2d interface (work in progress)
......@@ -13,6 +13,9 @@ from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.opt import register_specialize_device
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu,
......@@ -27,6 +30,12 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.tensor.nnet.abstract_conv2d import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.opt import register_specialize_device
def dnn_available():
if dnn_available.avail is None:
......@@ -1276,6 +1285,58 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
return GpuDnnConv3d(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad,
                   kerns_shp,
                   border_mode='valid', subsample=(1, 1),
                   conv_mode='conv'):
    """
    GPU convolution gradient with respect to weight using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    :param img: input images; made GPU-contiguous before use.
    :param topgrad: top gradient; made GPU-contiguous before use.
    :param kerns_shp: shape of the kernels. It is converted to a tensor
        variable, passed to the convolution descriptor and used to size
        the (uninitialised) buffer the gradient is written into.
    :param border_mode: forwarded to ``GpuDnnConvDesc``.
    :param subsample: forwarded to ``GpuDnnConvDesc``.
    :param conv_mode: forwarded to ``GpuDnnConvDesc``.
    :return: the result of applying ``GpuDnnConvGradW``.

    :warning: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higher.  This means that older GPU will not
      work with this Op.
    """
    contiguous_img = gpu_contiguous(img)
    contiguous_topgrad = gpu_contiguous(topgrad)
    symbolic_kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)

    # Build the descriptor op first, then apply it to the two shapes.
    make_desc = GpuDnnConvDesc(border_mode=border_mode,
                               subsample=subsample,
                               conv_mode=conv_mode)
    conv_desc = make_desc(contiguous_img.shape, symbolic_kerns_shp)

    # Output buffer with the shape of the kernels.
    grad_buffer = gpu_alloc_empty(*symbolic_kerns_shp)
    return GpuDnnConvGradW()(contiguous_img, contiguous_topgrad,
                             grad_buffer, conv_desc)
def dnn_gradinput(kerns, topgrad,
                  img_shp,
                  border_mode='valid', subsample=(1, 1),
                  conv_mode='conv'):
    """
    GPU convolution gradient with respect to input using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    :param kerns: convolution kernels; made GPU-contiguous before use.
    :param topgrad: top gradient; made GPU-contiguous before use.
    :param img_shp: shape of the input images. It is converted to a tensor
        variable, passed to the convolution descriptor and used to size
        the (uninitialised) buffer the gradient is written into.
    :param border_mode: forwarded to ``GpuDnnConvDesc``.
    :param subsample: forwarded to ``GpuDnnConvDesc``.
    :param conv_mode: forwarded to ``GpuDnnConvDesc``.
    :return: the result of applying ``GpuDnnConvGradI``.

    :warning: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higher.  This means that older GPU will not
      work with this Op.
    """
    contiguous_kerns = gpu_contiguous(kerns)
    contiguous_topgrad = gpu_contiguous(topgrad)
    symbolic_img_shp = theano.tensor.as_tensor_variable(img_shp)

    # Build the descriptor op first, then apply it to the two shapes.
    make_desc = GpuDnnConvDesc(border_mode=border_mode,
                               subsample=subsample,
                               conv_mode=conv_mode)
    conv_desc = make_desc(symbolic_img_shp, contiguous_kerns.shape)

    # Output buffer with the shape of the input images.
    grad_buffer = gpu_alloc_empty(*symbolic_img_shp)
    return GpuDnnConvGradI()(contiguous_kerns, contiguous_topgrad,
                             grad_buffer, conv_desc)
class GpuDnnPoolDesc(GpuOp):
"""
This Op builds a pooling descriptor for use in the other pooling operations.
......@@ -2383,3 +2444,47 @@ if True:
gpu_contiguous(ins[1])
)
return [out.dimshuffle(0, 1)]
### AbstractConv Optimizations
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """
    Replace an abstract 2d convolution, or one of its two gradient ops,
    by the corresponding cuDNN-based implementation.

    The replacement only happens when the first two inputs of the node are
    already of type ``CudaNdarrayType`` and ``dnn_available()`` is true.

    :param node: the Apply node being considered by the optimizer.
    :return: a one-element list holding the replacement variable, or
        ``None`` when the node is left untouched.
    """
    # An op is an instance of at most ONE of the three abstract classes, so
    # the rejection test must be "not an instance of any of them".  (Chaining
    # three ``not isinstance`` with ``or`` is always true and would disable
    # this optimizer entirely.)
    if not isinstance(node.op, (AbstractConv2d,
                                AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return None

    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if (not isinstance(inp1.type, CudaNdarrayType) or
            not isinstance(inp2.type, CudaNdarrayType)):
        return None

    if not dnn_available():
        return None

    # ``filter_flip`` is the attribute name used by the other abstract-conv
    # optimizers; a flipped filter maps to cuDNN's 'conv' mode, an unflipped
    # one to 'cross'.
    if node.op.filter_flip:
        conv_mode = 'conv'
    else:
        conv_mode = 'cross'

    if isinstance(node.op, AbstractConv2d):
        # inp1 is the image, inp2 the kernels.
        rval = dnn_conv(inp1, inp2,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='forward',
                        conv_mode=conv_mode)
        return [rval]

    if isinstance(node.op, AbstractConv2d_gradWeights):
        # inp1 is the image, inp2 the top gradient; node.inputs[2] supplies
        # the last two entries of the kernel shape.
        shape = (inp2.shape[1], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradweight(inp1, inp2, shape,
                              border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              conv_mode=conv_mode)
        return [rval]

    if isinstance(node.op, AbstractConv2d_gradInputs):
        # inp1 is the kernels, inp2 the top gradient; node.inputs[2] supplies
        # the last two entries of the image shape.
        shape = (inp2.shape[0], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradinput(inp1, inp2, shape,
                             border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             conv_mode=conv_mode)
        return [rval]

    return None
......@@ -75,6 +75,12 @@ from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint
from theano.tensor.nnet.abstract_conv2d import (BaseAbstractConv2d, AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.opt import register_specialize_device
try:
# We need to be able to import this file even if cuda isn't avail.
from theano.sandbox.cuda import device_properties
......@@ -2622,3 +2628,179 @@ optdb.register('local_inplace_gpu_sparse_block_outer',
import theano.sandbox.cuda.extra_ops
### Move to Gpu optimization
@local_optimizer([gpu_from_host,
                  AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_conv2d_gpu_conv(node):
    """
    Move abstract 2d convolution ops to the GPU.

    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)

    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)

    Returns a one-element list with the replacement variable, or ``None``
    when the node is left unchanged.
    """
    op = node.op
    original_out = node.outputs[0]

    if isinstance(op, GpuFromHost):
        cpu_var = node.inputs[0]
        producer = cpu_var.owner
        if producer and isinstance(producer.op, BaseAbstractConv2d):
            # Re-apply the very same abstract op, but on GPU copies of its
            # first two inputs.
            gpu_inputs = list(producer.inputs)
            gpu_inputs[0] = as_cuda_ndarray_variable(gpu_inputs[0])
            gpu_inputs[1] = as_cuda_ndarray_variable(gpu_inputs[1])
            moved = producer.op(*gpu_inputs)
            # moved is on the GPU because both inputs are.
            moved = theano.tensor.patternbroadcast(
                moved, original_out.broadcastable)
            moved.values_eq_approx = values_eq_approx_high_tol
            return [moved]

    # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
    if not isinstance(op, BaseAbstractConv2d):
        return None

    first = node.inputs[0]
    second = node.inputs[1]
    if (isinstance(first.type, CudaNdarrayType) and
            isinstance(second.type, CudaNdarrayType)):
        # Both inputs are already directly on the GPU, nothing to do
        return None

    # An input counts as "on the GPU" when it is a CudaNdarray itself or
    # was just transferred back from one.
    first_on_gpu = (isinstance(first.type, CudaNdarrayType) or
                    (first.owner and isinstance(first.owner.op, HostFromGpu)))
    second_on_gpu = (isinstance(second.type, CudaNdarrayType) or
                     (second.owner and isinstance(second.owner.op, HostFromGpu)))
    if not (first_on_gpu or second_on_gpu):
        return None

    gpu_inputs = list(node.inputs)
    gpu_inputs[0] = as_cuda_ndarray_variable(gpu_inputs[0])
    gpu_inputs[1] = as_cuda_ndarray_variable(gpu_inputs[1])
    moved = op(*gpu_inputs)
    # moved is on the GPU because both inputs are.
    moved = theano.tensor.patternbroadcast(moved, original_out.broadcastable)
    moved.values_eq_approx = values_eq_approx_high_tol

    # If the original output was on CPU, we have to transfer it
    if isinstance(original_out.type, tensor.TensorType):
        return [tensor.as_tensor_variable(moved)]
    return [moved]
register_opt()(local_conv2d_gpu_conv)
### Corrmm opt
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node):
    """
    Replace an ``AbstractConv2d`` whose two inputs are CudaNdarrays by a
    GEMM-based (GpuCorrMM family) implementation.

    * 'full' border mode with unit subsampling is expressed through
      ``GpuCorrMM_gradInputs``.
    * Every other case uses ``GpuCorrMM``, or ``GpuCorrMM_gradWeights`` when
      the statically known shapes suggest the latter is faster.

    :param node: the Apply node being considered by the optimizer.
    :return: a one-element list holding the replacement variable, or
        ``None`` when the node is left untouched.
    """
    if not isinstance(node.op, AbstractConv2d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, CudaNdarrayType) or
            not isinstance(kern.type, CudaNdarrayType)):
        return None

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    if (border_mode == 'full') and (subsample == (1, 1)):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3)
        # call GpuCorrMM_gradInputs
        rval = GpuCorrMM_gradInputs('valid', subsample)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
                                                 gpu_contiguous(kern))

        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
        # is larger than inputChannels * outputHeight * outputWidth.
        # The op does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        imshp = node.op.imshp
        kshp = node.op.kshp
        if ((subsample == (1, 1)) and
                (imshp is not None) and
                (None not in imshp[-2:]) and
                (kshp is not None) and
                (None not in kshp)):
            # we know the kernel and output size
            # NOTE(review): kshp[0] / kshp[1] are used as the kernel height
            # and width here -- confirm against the layout AbstractConv2d
            # stores in ``kshp``.
            prod1 = kshp[0] * kshp[1]
            prod2 = ((imshp[-2] - kshp[0] + 1) *
                     (imshp[-1] - kshp[1] + 1))
            # Both imshp[0] and imshp[1] are multiplied in below, so both
            # must be known; checking only imshp[:1] would let a ``None``
            # channel count through and fail with a TypeError.
            if None not in imshp[:2]:
                # we also know batchsize and input channels
                prod1 *= imshp[0]
                prod2 *= imshp[1]
            # compare to decide
            if prod1 > prod2:
                # (we need to wrap the result in as_cuda_ndarray_variable,
                # because we are not allowed to replace a CudaNdarray with
                # a DimShuffle instance in a graph optimization)
                rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
                    GpuCorrMM_gradWeights(border_mode, subsample)(
                        gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                        gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
                    ).dimshuffle(1, 0, 2, 3))
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node):
    """
    Replace an ``AbstractConv2d_gradWeights`` whose image and top-gradient
    inputs are CudaNdarrays by ``GpuCorrMM_gradWeights``.

    Returns a one-element list with the replacement variable, or ``None``
    when the node is left unchanged.
    """
    op = node.op
    if not isinstance(op, AbstractConv2d_gradWeights):
        return None

    img, topgrad, shape = node.inputs
    if not (isinstance(img.type, CudaNdarrayType) and
            isinstance(topgrad.type, CudaNdarrayType)):
        return None

    corr_grad_weights = GpuCorrMM_gradWeights(border_mode=op.border_mode,
                                              subsample=op.subsample)
    grad = corr_grad_weights(gpu_contiguous(img),
                             gpu_contiguous(topgrad),
                             shape)
    if op.filter_flip:
        # Flip the last two axes of the result.
        grad = grad[:, :, ::-1, ::-1]
    # Restore the broadcastable pattern of the output being replaced and
    # make sure the replacement is a CudaNdarray variable.
    grad = tensor.patternbroadcast(grad, node.outputs[0].broadcastable)
    return [as_cuda_ndarray_variable(grad)]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
    """
    Replace an ``AbstractConv2d_gradInputs`` whose kernel and top-gradient
    inputs are CudaNdarrays by ``GpuCorrMM_gradInputs``.

    Returns a one-element list with the replacement variable, or ``None``
    when the node is left unchanged.
    """
    op = node.op
    if not isinstance(op, AbstractConv2d_gradInputs):
        return None

    kern, topgrad, shape = node.inputs
    if not (isinstance(kern.type, CudaNdarrayType) and
            isinstance(topgrad.type, CudaNdarrayType)):
        return None

    # Flip the last two axes of the kernel when the op asks for it.
    flipped_kern = kern[:, :, ::-1, ::-1] if op.filter_flip else kern

    corr_grad_inputs = GpuCorrMM_gradInputs(border_mode=op.border_mode,
                                            subsample=op.subsample)
    grad = corr_grad_inputs(gpu_contiguous(flipped_kern),
                            gpu_contiguous(topgrad),
                            shape)
    return [grad]
# Register the GPU convolution implementations.
# They are tried in a specific order (the numeric position passed to
# ``register``) so we can control which ones take precedence over others.
# A dedicated local group is created and hooked in through
# register_specialize_device under the 'gpu' and 'fast_compile' tags.
abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpu_abstractconv_opts"
register_specialize_device(abstractconv_groupopt, 'gpu', 'fast_compile')
# NOTE(review): every registration below targets ``conv_groupopt`` (defined
# outside this section), so ``abstractconv_groupopt`` stays empty as far as
# this section shows -- confirm which group was intended.
# cuDNN is first (position 20).
# NOTE(review): the registration itself is unconditional here; presumably
# cuDNN availability is checked when the optimizer runs -- confirm.
conv_groupopt.register('local_abstractconv_dnn', dnn.local_abstractconv_cudnn, 20,
'conv_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
# The GEMM-based convolution comes last (position 30) to catch all
# remaining cases.
# It can be disabled by excluding 'conv_gemm'.
conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
# GEMM-based gradient with respect to the weights, same position and tags.
conv_groupopt.register('local_abstractconv_gradweight_gemm',
local_abstractconv_gradweight_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
# GEMM-based gradient with respect to the inputs, same position and tags.
conv_groupopt.register('local_abstractconv_gradinputs_gemm',
local_abstractconv_gradinputs_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
差异被折叠。
......@@ -158,9 +158,9 @@ class Conv3D(theano.Op):
vidDur = V_shape[3]
filterDur = W_shape[3]
output_height = T.floor((vidHeight - filterHeight) // dr) + 1
output_width = T.floor((vidWidth - filterWidth) // dc) + 1
output_dur = T.floor((vidDur - filterDur) // dt) + 1
output_height = ((vidHeight - filterHeight) // dr) + 1
output_width = ((vidWidth - filterWidth) // dc) + 1
output_dur = ((vidDur - filterDur) // dt) + 1
rval = (batch_size, output_height, output_width, output_dur, output_channels)
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论