提交 6d4633be authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3737 from lamblin/gpuarray_abstractconv

Gpuarray abstractconv
......@@ -2406,14 +2406,14 @@ if True:
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(node.op, (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)):
return None
......
......@@ -237,124 +237,3 @@ class TestConv2d(unittest.TestCase):
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
def test_cormm_conv(self):
    """Exercise fwd / grad-weight / grad-input on the CPU corrMM path.

    Iterates over every combination of input/filter shape, subsample,
    border mode, filter_flip and provide_shape that the test class
    declares, verifying gradients for each call.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu
    configs = itertools.product(
        zip(self.inputs_shapes, self.filters_shapes),
        self.subsamples,
        self.border_modes,
        self.filter_flip,
        [False, True])
    for (in_shp, fil_shp), sub, bmode, fflip, with_shape in configs:
        out_shp = self.get_output_shape(in_shp, fil_shp, sub, bmode)
        # Keyword arguments shared by the three runners below.
        common = dict(verify_grad=True, mode=mode, device='cpu',
                      provide_shape=with_shape, border_mode=bmode,
                      filter_flip=fflip)
        self.run_fwd(inputs_shape=in_shp, filters_shape=fil_shp,
                     subsample=sub, **common)
        self.run_gradweight(inputs_shape=in_shp, filters_shape=fil_shp,
                            output_shape=out_shp, subsample=sub,
                            **common)
        self.run_gradinput(inputs_shape=in_shp, filters_shape=fil_shp,
                           output_shape=out_shp, subsample=sub,
                           **common)
def test_cpu_conv(self):
    """Exercise the plain CPU conv path (conv_gemm excluded).

    Configurations the legacy CPU ops support are run normally; the
    rest are expected to raise NotImplementedError.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu.excluding('conv_gemm')
    for (in_shp, fil_shp), sub, bmode, fflip, with_shape in \
            itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,
                self.border_modes,
                self.filter_flip,
                [False, True]):
        out_shp = self.get_output_shape(in_shp, fil_shp, sub, bmode)
        # The legacy path only implements flipped filters with
        # 'valid'/'full' border modes.
        fwd_ok = fflip and bmode in ('valid', 'full')
        # Both gradients share the forward restrictions plus two more
        # limits on the 'full' mode (in the original code the
        # grad-weight and grad-input flags were always equal).
        grad_ok = fwd_ok
        if (not with_shape) and sub != (1, 1) and bmode == 'full':
            grad_ok = False
        if bmode == 'full' and (sub[0] not in (1, 2) or
                                sub[1] not in (1, 2)):
            grad_ok = False
        if fwd_ok:
            self.run_fwd(inputs_shape=in_shp, filters_shape=fil_shp,
                         subsample=sub, verify_grad=True, mode=mode,
                         device='cpu', provide_shape=with_shape,
                         border_mode=bmode, filter_flip=fflip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_fwd,
                              inputs_shape=in_shp,
                              filters_shape=fil_shp,
                              subsample=sub,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=with_shape,
                              border_mode=bmode,
                              filter_flip=fflip)
        if grad_ok:
            self.run_gradweight(inputs_shape=in_shp,
                                filters_shape=fil_shp,
                                output_shape=out_shp, subsample=sub,
                                verify_grad=False, mode=mode,
                                device='cpu', provide_shape=with_shape,
                                border_mode=bmode, filter_flip=fflip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradweight,
                              inputs_shape=in_shp,
                              filters_shape=fil_shp,
                              output_shape=out_shp,
                              subsample=sub,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=with_shape,
                              border_mode=bmode,
                              filter_flip=fflip)
        if grad_ok:
            self.run_gradinput(inputs_shape=in_shp,
                               filters_shape=fil_shp,
                               output_shape=out_shp, subsample=sub,
                               verify_grad=False, mode=mode,
                               device='cpu', provide_shape=with_shape,
                               border_mode=bmode, filter_flip=fflip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradinput,
                              inputs_shape=in_shp,
                              filters_shape=fil_shp,
                              output_shape=out_shp,
                              subsample=sub,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=with_shape,
                              border_mode=bmode,
                              filter_flip=fflip)
差异被折叠。
差异被折叠。
差异被折叠。
......@@ -5,6 +5,7 @@ import warnings
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler
......@@ -12,17 +13,19 @@ from theano.gof.type import CDataType, Generic
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
get_conv_output_shape)
from theano.tensor.signal.downsample import (DownsampleFactorMax,
MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from .type import get_context, gpu_context_type, list_contexts
from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise
from .conv import GpuConv
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
......@@ -819,6 +822,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
                   subsample=(1, 1), conv_mode='conv'):
    """Build the cuDNN gradient of a convolution w.r.t. its kernels.

    Parameters mirror ``dnn_conv``; ``kerns_shp`` is the shape of the
    kernel tensor whose gradient is computed.
    """
    context = infer_context_name(img, topgrad)
    kerns_shp = as_tensor_variable(kerns_shp)
    # cuDNN requires contiguous inputs.
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    descriptor = GpuDnnConvDesc(border_mode=border_mode,
                                subsample=subsample,
                                conv_mode=conv_mode)(kerns_shp)
    # Uninitialized output buffer with the requested kernel shape.
    kerns_buf = GpuAllocEmpty(img.dtype, context)(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, kerns_buf, descriptor)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
                  subsample=(1, 1), conv_mode='conv'):
    """Build the cuDNN gradient of a convolution w.r.t. its inputs.

    Parameters mirror ``dnn_conv``; ``img_shp`` is the shape of the
    image tensor whose gradient is computed.
    """
    context = infer_context_name(kerns, topgrad)
    img_shp = as_tensor_variable(img_shp)
    # cuDNN requires contiguous inputs.
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    # The convolution descriptor is parameterized by the kernel shape.
    descriptor = GpuDnnConvDesc(border_mode=border_mode,
                                subsample=subsample,
                                conv_mode=conv_mode)(kerns.shape)
    # Uninitialized output buffer with the requested image shape.
    img_buf = GpuAllocEmpty(kerns.dtype, context)(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, img_buf, descriptor)
class GpuDnnPoolDesc(Op):
"""
This Op builds a pooling descriptor for use in the other
......@@ -1188,57 +1215,53 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()])
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv])
def local_conv_dnn(node):
    """Replace a ``GpuConv`` by the equivalent cuDNN forward convolution.

    Fires only when cuDNN is available in the output's context and the
    border mode is one cuDNN handles ('full' or 'valid').
    """
    if not isinstance(node.op, GpuConv):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    if node.op.border_mode not in ('full', 'valid'):
        return
    img, kern = node.inputs
    replacement = dnn_conv(img, kern,
                           border_mode=node.op.border_mode,
                           subsample=node.op.subsample,
                           direction_hint=node.op.direction_hint)
    return [replacement]
# This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
    """Replace a ``GpuConv`` by cuDNN, steering it to the *opposite*
    code path of ``local_conv_dnn``.

    For some input/kernel shape configurations the alternative
    direction is faster; the meta-optimizer tries both.
    """
    if not isinstance(node.op, GpuConv):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    if border_mode not in ('full', 'valid') or subsample != (1, 1):
        return
    img, kern = node.inputs
    hint = node.op.direction_hint
    if border_mode == 'full':
        # For a full convolution, try using the forward pass instead
        # of the backward pass wrt. inputs.
        hint = 'forward!'
    else:
        # border_mode == 'valid': swap between the forward pass and
        # the backward pass wrt. the weights.
        hint = 'forward' if hint == 'bprop weights' else 'bprop weights'
    replacement = dnn_conv(img, kern,
                           border_mode=border_mode,
                           subsample=subsample,
                           direction_hint=hint)
    return [replacement]
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """Lower an abstract conv2d (or one of its gradients) to cuDNN.

    Fires only when both inputs already live on the GPU
    (``GpuArrayType``) and cuDNN is available in that context.
    """
    if not isinstance(node.op, (AbstractConv2d,
                                AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if (not isinstance(inp1.type, GpuArrayType) or
            not isinstance(inp2.type, GpuArrayType)):
        return None
    if not dnn_available(inp1.type.context_name):
        return None

    # filter_flip selects a true convolution; otherwise a correlation.
    conv_mode = 'conv' if node.op.filter_flip else 'cross'
    # The three op classes are mutually exclusive, so if/elif/else is
    # equivalent to the original chain of independent ifs.
    if isinstance(node.op, AbstractConv2d):
        rval = dnn_conv(inp1, inp2,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='forward!',
                        conv_mode=conv_mode)
    elif isinstance(node.op, AbstractConv2d_gradWeights):
        # Kernel shape rebuilt from the inputs plus the symbolic
        # height/width passed as node.inputs[2].
        shape = (inp2.shape[1], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradweight(inp1, inp2, shape,
                              border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              conv_mode=conv_mode)
    else:  # AbstractConv2d_gradInputs
        # Image shape rebuilt from the inputs plus the symbolic
        # height/width passed as node.inputs[2].
        shape = (inp2.shape[0], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradinput(inp1, inp2, shape,
                             border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             conv_mode=conv_mode)
    return [rval]


# BUG FIX: in the scraped diff the 'local_conv_dnn' registration was
# interleaved *inside* the function body above (between a ``return None``
# and the filter_flip branch), where it would either be dead code or a
# syntax-breaking artifact.  Both registrations are module-level
# statements and belong here, after the optimizers they register.
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
                       'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
conv_groupopt.register('local_abstractconv_cudnn',
                       local_abstractconv_cudnn, 20,
                       'fast_compile', 'fast_run',
                       'gpuarray', 'conv_dnn', 'cudnn')
@inplace_allocempty(GpuDnnConv, 2)
......
......@@ -14,7 +14,13 @@ from theano.gof.optdb import LocalGroupDB
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor import as_tensor_variable
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv2d,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tests.breakpoint import PdbBreakpoint
from .type import (GpuArrayType, GpuArrayConstant, get_context,
......@@ -27,7 +33,6 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
......@@ -786,77 +791,49 @@ def local_assert(node, context_name):
@register_opt('fast_compile')
@op_lifter([ConvOp])
# Lift a CPU ConvOp onto the GPU as a GpuConv.
# NOTE(review): this block was recovered from a diff view with its
# indentation stripped; the statement order below may be a merge
# artifact (``logical_img_hw`` is passed to GpuConv before the branch
# that assigns it) -- confirm against the repository before editing.
def local_gpu_conv(node, context_name):
# Build a GpuConv matching the CPU op's parameters, or None when the
# logical kernel shape differs from the physical one (unsupported).
def GpuConvOp_from_ConvOp(op):
logical_img_hw = None
if op.kshp_logical is not None and op.kshp_logical != op.kshp:
return None
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp,
version=op.version,
direction_hint=op.direction_hint,
verbose=op.verbose,
imshp=op.imshp,
nkern=op.nkern,
bsize=op.bsize,
fft_opt=op.fft_opt)
if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
# Logical and physical image shapes differ: the image must be
# dilated (zero-padded with strides) before convolving.
rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] /
float(op.imshp[2])))
# Wrap the op in a small graph that scatters the physical image
# into a zero buffer of the logical shape, then runs GpuConv.
def make_graph(img, kern):
buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype),
img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img)
img = GpuFromHost(context_name)(img)
return ret(img, kern)
return make_graph
return ret
def values_eq_approx(a, b):
"""
Compare convolution outputs with a relaxed absolute tolerance.

Without this, DebugMode raises spurious errors from rounding:
the convolution reduces over the two last dimensions, so the
absolute error grows with the number of reduced elements.
"""
assert a.ndim == 4
atol = None
if a.shape[-1] * a.shape[-2] > 100:
# For float32 the default atol is 1e-5
atol = 3e-5
return GpuArrayType.values_eq_approx(a, b, atol=atol)
img, kern = node.inputs
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None:
return
out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern))
assert isinstance(out.type, GpuArrayType)
# Make sure to keep the broadcastable pattern of the original
# convolution even if we might gain or lose some due to different
# information at the node level.
out = tensor.patternbroadcast(out, node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx
return [out]
# Register this here so that it goes after 'local_gpu_conv'
def local_error_convop(node, context_name):
    """Unconditionally reject a leftover ConvOp on the gpuarray backend.

    Registered after 'local_gpu_conv' so it only fires when the lifting
    optimizer could not handle the node.
    """
    assert False, """
ConvOp does not work with the gpuarray backend.
Use the new convolution interface to have GPU convolution working:
theano.tensor.nnet.conv2d()
"""
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile')
@op_lifter([AbstractConv2d,
            AbstractConv2d_gradWeights,
            AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(node, context_name):
    """Transfer both conv inputs to the GPU and rebuild the same op there."""
    new_inputs = list(node.inputs)
    for idx in (0, 1):
        new_inputs[idx] = as_gpuarray_variable(node.inputs[idx],
                                               context_name=context_name)
    return [node.op(*new_inputs)]
# This will deal with ops that don't have an explicit transfer but
# have one of their inputs on the GPU already and the other not on the
# GPU (to avoid endlessly replacing things).
@register_opt('fast_compile')
@local_optimizer([AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_gpu_abstractconv2d(node):
    """Move an abstract conv to the GPU when exactly one input is there.

    Handles the case where there is no explicit transfer but one input
    is already a GpuArray and the other is not; requiring that not
    *both* are on the GPU avoids replacing the node endlessly.
    """
    if not isinstance(node.op, BaseAbstractConv2d):
        return
    on_gpu_0 = isinstance(node.inputs[0].type, GpuArrayType)
    on_gpu_1 = isinstance(node.inputs[1].type, GpuArrayType)
    # BUG FIX: the original condition was
    #   (on_gpu_0 or on_gpu_1) and not (on_gpu_0 or on_gpu_1)
    # which is a contradiction (always False), so the optimizer could
    # never fire.  Per the comment above this definition, the intent is
    # "at least one input on the GPU, but not both".
    if (on_gpu_0 or on_gpu_1) and not (on_gpu_0 and on_gpu_1):
        inps = list(node.inputs)
        ctx_name = infer_context_name(inps[0], inps[1])
        inps[0] = as_gpuarray_variable(inps[0], context_name=ctx_name)
        inps[1] = as_gpuarray_variable(inps[1], context_name=ctx_name)
        # NOTE(review): sibling optimizers return a *list* of
        # replacements; confirm whether a bare variable is accepted by
        # the optimizer framework here.
        return as_tensor_variable(node.op(*inps))
# Register this here so that it goes after the abstract lifting
register_opt()(conv_groupopt)
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论