Commit 6d4633be authored by Frédéric Bastien

Merge pull request #3737 from lamblin/gpuarray_abstractconv

Gpuarray abstractconv
...@@ -2406,14 +2406,14 @@ if True: ...@@ -2406,14 +2406,14 @@ if True:
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, @local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node): def local_abstractconv_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(node.op, (AbstractConv2d, if (not isinstance(node.op, (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))): AbstractConv2d_gradInputs))):
return None return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(inp1.type, CudaNdarrayType) or if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)): not isinstance(inp2.type, CudaNdarrayType)):
return None return None
......
...@@ -237,124 +237,3 @@ class TestConv2d(unittest.TestCase): ...@@ -237,124 +237,3 @@ class TestConv2d(unittest.TestCase):
verify_grad=True, mode=mode, device='gpu', verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
filter_flip=flip) filter_flip=flip)
def test_cormm_conv(self):
    """Run forward, grad-weight and grad-input convolutions on the CPU.

    Iterates over every combination of input/filter shape pair,
    subsampling, border mode, filter flip and shape-hint flag, running
    the forward pass and both gradients with gradient verification
    enabled.  The name suggests this is meant to exercise the CorrMM
    (gemm) code path -- presumably 'cormm' is a typo for 'corrmm';
    confirm before renaming.
    """
    # NOTE(review): requiring cuDNN availability for what looks like a
    # CPU/gemm test seems unrelated -- confirm this skip is intentional.
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu
    for (i, f), s, b, flip, provide_shape in itertools.product(
            zip(self.inputs_shapes, self.filters_shapes),
            self.subsamples,
            self.border_modes,
            self.filter_flip,
            [False, True]):
        o = self.get_output_shape(i, f, s, b)
        self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                     verify_grad=True, mode=mode, device='cpu',
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode, device='cpu',
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode, device='cpu',
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip)
def test_cpu_conv(self):
    """Check the CPU fallback path of the abstract conv interface.

    With the 'conv_gemm' optimization excluded from the mode, only the
    legacy CPU convolution remains, which handles a restricted set of
    configurations.  Supported configurations are run (forward pass
    with gradient verification); every unsupported configuration must
    raise NotImplementedError.
    """
    # NOTE(review): requiring cuDNN availability for a CPU-only test
    # seems unrelated -- confirm this skip is intentional.
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu.excluding('conv_gemm')
    for (i, f), s, b, flip, provide_shape in itertools.product(
            zip(self.inputs_shapes, self.filters_shapes),
            self.subsamples,
            self.border_modes,
            self.filter_flip,
            [False, True]):
        o = self.get_output_shape(i, f, s, b)
        # Decide, per configuration, which of the three operations the
        # fallback implementation is expected to support.
        fwd_OK = True
        gradweight_OK = True
        gradinput_OK = True
        # Unflipped filters (cross-correlation) are expected to fail.
        if not flip:
            fwd_OK = False
            gradweight_OK = False
            gradinput_OK = False
        # Only 'valid' and 'full' border modes are expected to work.
        if b not in ('valid', 'full'):
            fwd_OK = False
            gradweight_OK = False
            gradinput_OK = False
        # 'full' gradients with subsampling need explicit shapes.
        if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
            gradweight_OK = False
            gradinput_OK = False
        # 'full' gradients are expected to support strides 1 and 2 only.
        if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
            gradweight_OK = False
            gradinput_OK = False
        if fwd_OK:
            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                         verify_grad=True, mode=mode, device='cpu',
                         provide_shape=provide_shape, border_mode=b,
                         filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_fwd,
                              inputs_shape=i,
                              filters_shape=f,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
        if gradweight_OK:
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=False, mode=mode, device='cpu',
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradweight,
                              inputs_shape=i,
                              filters_shape=f,
                              output_shape=o,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
        if gradinput_OK:
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=False, mode=mode, device='cpu',
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradinput,
                              inputs_shape=i,
                              filters_shape=f,
                              output_shape=o,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
Diff is collapsed.
Diff is collapsed.
Diff is collapsed.
...@@ -5,6 +5,7 @@ import warnings ...@@ -5,6 +5,7 @@ import warnings
import theano import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log from theano.scalar import as_scalar, constant, Log
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
...@@ -12,17 +13,19 @@ from theano.gof.type import CDataType, Generic ...@@ -12,17 +13,19 @@ from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.signal.downsample import ( AbstractConv2d_gradWeights,
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) AbstractConv2d_gradInputs,
get_conv_output_shape)
from theano.tensor.signal.downsample import (DownsampleFactorMax,
MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu
from .type import get_context, gpu_context_type, list_contexts from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like) GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .conv import GpuConv
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
...@@ -819,6 +822,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -819,6 +822,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
                   subsample=(1, 1), conv_mode='conv'):
    """GPU convolution gradient with respect to the weights, via cuDNN.

    Parameters
    ----------
    img
        Input images of the forward convolution (GPU variable).
    topgrad
        Gradient w.r.t. the output of the forward convolution.
    kerns_shp
        Shape of the kernels to compute the gradient for; converted to
        a symbolic tensor variable.
    border_mode, subsample, conv_mode
        Convolution descriptor parameters, forwarded to GpuDnnConvDesc.

    Returns
    -------
    The gradient of the convolution w.r.t. its weights, with shape
    ``kerns_shp``.
    """
    ctx_name = infer_context_name(img, topgrad)
    # cuDNN requires contiguous inputs.
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    kerns_shp = as_tensor_variable(kerns_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(kerns_shp)
    # Preallocate the output buffer; GpuDnnConvGradW writes into it.
    out = GpuAllocEmpty(img.dtype, ctx_name)(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
                  subsample=(1, 1), conv_mode='conv'):
    """GPU convolution gradient with respect to the inputs, via cuDNN.

    Parameters
    ----------
    kerns
        Kernels of the forward convolution (GPU variable).
    topgrad
        Gradient w.r.t. the output of the forward convolution.
    img_shp
        Shape of the input images to compute the gradient for;
        converted to a symbolic tensor variable.
    border_mode, subsample, conv_mode
        Convolution descriptor parameters, forwarded to GpuDnnConvDesc.

    Returns
    -------
    The gradient of the convolution w.r.t. its inputs, with shape
    ``img_shp``.
    """
    ctx_name = infer_context_name(kerns, topgrad)
    # cuDNN requires contiguous inputs.
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    img_shp = as_tensor_variable(img_shp)
    # The descriptor is built from the kernel shape (not img_shp),
    # matching the convention used by dnn_gradweight above.
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(kerns.shape)
    # Preallocate the output buffer; GpuDnnConvGradI writes into it.
    out = GpuAllocEmpty(kerns.dtype, ctx_name)(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(Op): class GpuDnnPoolDesc(Op):
""" """
This Op builds a pooling descriptor for use in the other This Op builds a pooling descriptor for use in the other
...@@ -1188,57 +1215,53 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1188,57 +1215,53 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
# @register_opt('cudnn') # this optimizer is registered in opt.py instead. @local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
@local_optimizer([GpuConv]) AbstractConv2d_gradInputs])
def local_conv_dnn(node): def local_abstractconv_cudnn(node):
if isinstance(node.op, GpuConv): if (not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
if not dnn_available(node.outputs[0].type.context_name): AbstractConv2d_gradInputs))):
return return None
if node.op.border_mode not in ['full', 'valid']: inp1 = node.inputs[0]
return inp2 = node.inputs[1]
img, kern = node.inputs
border_mode = node.op.border_mode
subsample = node.op.subsample
direction_hint = node.op.direction_hint
rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)
return [rval]
# This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
if isinstance(node.op, GpuConv):
if not dnn_available(node.outputs[0].type.context_name):
return
border_mode = node.op.border_mode
subsample = node.op.subsample
if border_mode not in ['full', 'valid'] or subsample != (1, 1):
return
img, kern = node.inputs
direction_hint = node.op.direction_hint
if border_mode == 'full':
# for a full convolution, try using the forward pass instead
# of the backward pass wrt. inputs
direction_hint = 'forward!'
elif border_mode == 'valid':
# for a valid convolution, try using the backward pass wrt.
# weights instead of the forward pass and vice versa
if direction_hint == 'bprop weights':
direction_hint = 'forward'
else:
direction_hint = 'bprop weights'
rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)
return [rval]
if (not isinstance(inp1.type, GpuArrayType) or
not isinstance(inp2.type, GpuArrayType)):
return None
if not dnn_available(inp1.type.context_name):
return None
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20, if node.op.filter_flip:
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn') conv_mode = 'conv'
else:
conv_mode = 'cross'
if isinstance(node.op, AbstractConv2d):
rval = dnn_conv(inp1, inp2,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
direction_hint='forward!',
conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradWeights):
shape = (inp2.shape[1], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1])
rval = dnn_gradweight(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1])
rval = dnn_gradinput(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode=conv_mode)
return [rval]
conv_groupopt.register('local_abstractconv_cudnn',
local_abstractconv_cudnn, 20,
'fast_compile', 'fast_run',
'gpuarray', 'conv_dnn', 'cudnn')
@inplace_allocempty(GpuDnnConv, 2) @inplace_allocempty(GpuDnnConv, 2)
......
...@@ -14,7 +14,13 @@ from theano.gof.optdb import LocalGroupDB ...@@ -14,7 +14,13 @@ from theano.gof.optdb import LocalGroupDB
from theano.scalar.basic import Scalar, Pow, Cast from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor import as_tensor_variable
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv2d,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from .type import (GpuArrayType, GpuArrayConstant, get_context, from .type import (GpuArrayType, GpuArrayConstant, get_context,
...@@ -27,7 +33,6 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -27,7 +33,6 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace) gpugemm_no_inplace)
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax)
...@@ -786,77 +791,49 @@ def local_assert(node, context_name): ...@@ -786,77 +791,49 @@ def local_assert(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([ConvOp]) @op_lifter([ConvOp])
def local_gpu_conv(node, context_name): def local_error_convop(node, context_name):
def GpuConvOp_from_ConvOp(op): assert False, """
logical_img_hw = None ConvOp does not work with the gpuarray backend.
if op.kshp_logical is not None and op.kshp_logical != op.kshp: Use the new convolution interface to have GPU convolution working:
return None theano.tensor.nnet.conv2d()
"""
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw, # This deals with any abstract convs that have a transfer somewhere
logical_kern_hw=op.kshp_logical, @register_opt('fast_compile')
logical_kern_align_top=op.kshp_logical_top_aligned, @op_lifter([AbstractConv2d,
kshp=op.kshp, AbstractConv2d_gradWeights,
version=op.version, AbstractConv2d_gradInputs])
direction_hint=op.direction_hint, def local_lift_abstractconv2d(node, context_name):
verbose=op.verbose, inps = list(node.inputs)
imshp=op.imshp, inps[0] = as_gpuarray_variable(node.inputs[0],
nkern=op.nkern, context_name=context_name)
bsize=op.bsize, inps[1] = as_gpuarray_variable(node.inputs[1],
fft_opt=op.fft_opt) context_name=context_name)
if op.imshp_logical is not None: return [node.op(*inps)]
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
rstride = int(numpy.ceil(op.imshp_logical[1] / # This will deal with ops that don't have an explicit transfer but
float(op.imshp[1]))) # have one of their inputs on the GPU already and the other not on the
cstride = int(numpy.ceil(op.imshp_logical[2] / # GPU (to avoid endlessly replacing things).
float(op.imshp[2]))) @register_opt('fast_compile')
@local_optimizer([AbstractConv2d,
def make_graph(img, kern): AbstractConv2d_gradWeights,
buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype), AbstractConv2d_gradInputs])
img.shape[0], *op.imshp_logical) def local_gpu_abstractconv2d(node):
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride], if isinstance(node.op, BaseAbstractConv2d):
img) if ((isinstance(node.inputs[0].type, GpuArrayType) or
img = GpuFromHost(context_name)(img) isinstance(node.inputs[1].type, GpuArrayType)) and
return ret(img, kern) not (isinstance(node.inputs[0].type, GpuArrayType) or
isinstance(node.inputs[1].type, GpuArrayType))):
return make_graph inps = list(node.inputs)
return ret ctx_name = infer_context_name(inps[0], inps[1])
inps[0] = as_gpuarray_variable(inps[0], context_name=ctx_name)
def values_eq_approx(a, b): inps[1] = as_gpuarray_variable(inps[1], context_name=ctx_name)
""" return as_tensor_variable(node.op(*inps))
This fct is needed to don't have DebugMode raise useless
error due to ronding error. # Register this here so that it goes after the abstract lifting
This happen as We reduce on the two last dimensions, so this
can raise the absolute error if the number of element we
reduce on is significant.
"""
assert a.ndim == 4
atol = None
if a.shape[-1] * a.shape[-2] > 100:
# For float32 the default atol is 1e-5
atol = 3e-5
return GpuArrayType.values_eq_approx(a, b, atol=atol)
img, kern = node.inputs
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None:
return
out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern))
assert isinstance(out.type, GpuArrayType)
# Make sure to keep the broadcastable pattern of the original
# convolution even if we might gain or lose some due to different
# information at the node level.
out = tensor.patternbroadcast(out, node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx
return [out]
# Register this here so that it goes after 'local_gpu_conv'
register_opt()(conv_groupopt) register_opt()(conv_groupopt)
......
Diff is collapsed.
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment