Commit 6d4633be authored by Frédéric Bastien

Merge pull request #3737 from lamblin/gpuarray_abstractconv

Gpuarray abstractconv
...@@ -2406,14 +2406,14 @@ if True: ...@@ -2406,14 +2406,14 @@ if True:
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, @local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node): def local_abstractconv_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(node.op, (AbstractConv2d, if (not isinstance(node.op, (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))): AbstractConv2d_gradInputs))):
return None return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(inp1.type, CudaNdarrayType) or if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)): not isinstance(inp2.type, CudaNdarrayType)):
return None return None
......
...@@ -237,124 +237,3 @@ class TestConv2d(unittest.TestCase): ...@@ -237,124 +237,3 @@ class TestConv2d(unittest.TestCase):
verify_grad=True, mode=mode, device='gpu', verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
filter_flip=flip) filter_flip=flip)
def test_cormm_conv(self):
    """Exercise forward, grad-weight and grad-input convolution on CPU.

    Uses ``mode_without_gpu`` without excluding ``conv_gemm``, so the
    AbstractConv2d ops are presumably replaced by the CorrMM-based CPU
    implementations (contrast with ``test_cpu_conv`` below, which
    excludes ``conv_gemm``) -- TODO confirm against the optimizer tags.

    NOTE(review): the guard skips on ``dnn_available()`` and reuses the
    cuDNN skip message even though every run uses ``device='cpu'``; this
    looks copy-pasted from a dnn test -- confirm the guard is intended.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu
    # Sweep every combination of input/filter shape pair, stride,
    # border mode, filter flipping, and whether shapes are provided
    # explicitly to the ops.
    for (i, f), s, b, flip, provide_shape in itertools.product(
            zip(self.inputs_shapes, self.filters_shapes),
            self.subsamples,
            self.border_modes,
            self.filter_flip,
            [False, True]):
        # Expected output shape for the grad tests below.
        o = self.get_output_shape(i, f, s, b)
        self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                     verify_grad=True, mode=mode, device='cpu',
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode, device='cpu',
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode, device='cpu',
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip)
def test_cpu_conv(self):
    """Test the legacy CPU convolution path (with ``conv_gemm`` excluded).

    For parameter combinations the legacy path does not support, assert
    that ``NotImplementedError`` is raised instead of a wrong result.

    NOTE(review): as in ``test_cormm_conv``, the ``dnn_available()``
    guard and cuDNN skip message on a CPU-only test look copy-pasted --
    confirm they are intentional.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    # Excluding 'conv_gemm' prevents the CorrMM replacement, so the
    # remaining (legacy) CPU implementation is what gets exercised.
    mode = mode_without_gpu.excluding('conv_gemm')
    for (i, f), s, b, flip, provide_shape in itertools.product(
            zip(self.inputs_shapes, self.filters_shapes),
            self.subsamples,
            self.border_modes,
            self.filter_flip,
            [False, True]):
        o = self.get_output_shape(i, f, s, b)
        # Flags tracking which of the three computations the legacy
        # path is expected to support for this parameter combination.
        fwd_OK = True
        gradweight_OK = True
        gradinput_OK = True
        # The legacy path only implements flipped (true convolution)
        # filters.
        if not flip:
            fwd_OK = False
            gradweight_OK = False
            gradinput_OK = False
        # Only 'valid' and 'full' border modes are supported.
        if b not in ('valid', 'full'):
            fwd_OK = False
            gradweight_OK = False
            gradinput_OK = False
        # 'full' mode with strides needs explicit shapes for the grads.
        if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
            gradweight_OK = False
            gradinput_OK = False
        # 'full' mode grads only support strides of 1 or 2.
        if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
            gradweight_OK = False
            gradinput_OK = False
        if fwd_OK:
            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                         verify_grad=True, mode=mode, device='cpu',
                         provide_shape=provide_shape, border_mode=b,
                         filter_flip=flip)
        else:
            # Unsupported combination: the op must refuse explicitly.
            self.assertRaises(NotImplementedError,
                              self.run_fwd,
                              inputs_shape=i,
                              filters_shape=f,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
        if gradweight_OK:
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=False, mode=mode, device='cpu',
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradweight,
                              inputs_shape=i,
                              filters_shape=f,
                              output_shape=o,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
        if gradinput_OK:
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=False, mode=mode, device='cpu',
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradinput,
                              inputs_shape=i,
                              filters_shape=f,
                              output_shape=o,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
Diff collapsed.
Diff collapsed.
Diff collapsed.
...@@ -5,6 +5,7 @@ import warnings ...@@ -5,6 +5,7 @@ import warnings
import theano import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log from theano.scalar import as_scalar, constant, Log
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
...@@ -12,17 +13,19 @@ from theano.gof.type import CDataType, Generic ...@@ -12,17 +13,19 @@ from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.signal.downsample import ( AbstractConv2d_gradWeights,
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) AbstractConv2d_gradInputs,
get_conv_output_shape)
from theano.tensor.signal.downsample import (DownsampleFactorMax,
MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu
from .type import get_context, gpu_context_type, list_contexts from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like) GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .conv import GpuConv
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
...@@ -819,6 +822,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -819,6 +822,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
                   subsample=(1, 1), conv_mode='conv'):
    """Build the graph computing the cuDNN convolution gradient
    w.r.t. the filters.

    ``img`` and ``topgrad`` are made contiguous on the GPU; the filter
    shape ``kerns_shp`` drives both the convolution descriptor and the
    pre-allocated output buffer handed to ``GpuDnnConvGradW``.
    """
    context = infer_context_name(img, topgrad)
    shape_var = as_tensor_variable(kerns_shp)
    contiguous_img = gpu_contiguous(img)
    contiguous_topgrad = gpu_contiguous(topgrad)
    descriptor = GpuDnnConvDesc(border_mode=border_mode,
                                subsample=subsample,
                                conv_mode=conv_mode)(shape_var)
    empty_kerns = GpuAllocEmpty(contiguous_img.dtype, context)(*shape_var)
    return GpuDnnConvGradW()(contiguous_img, contiguous_topgrad,
                             empty_kerns, descriptor)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
                  subsample=(1, 1), conv_mode='conv'):
    """Build the graph computing the cuDNN convolution gradient
    w.r.t. the input image.

    ``kerns`` and ``topgrad`` are made contiguous on the GPU; the
    descriptor is built from the (contiguous) kernel shape, while the
    image shape ``img_shp`` sizes the output buffer handed to
    ``GpuDnnConvGradI``.
    """
    context = infer_context_name(kerns, topgrad)
    contiguous_kerns = gpu_contiguous(kerns)
    contiguous_topgrad = gpu_contiguous(topgrad)
    shape_var = as_tensor_variable(img_shp)
    descriptor = GpuDnnConvDesc(border_mode=border_mode,
                                subsample=subsample,
                                conv_mode=conv_mode)(contiguous_kerns.shape)
    empty_img = GpuAllocEmpty(contiguous_kerns.dtype, context)(*shape_var)
    return GpuDnnConvGradI()(contiguous_kerns, contiguous_topgrad,
                             empty_img, descriptor)
class GpuDnnPoolDesc(Op): class GpuDnnPoolDesc(Op):
""" """
This Op builds a pooling descriptor for use in the other This Op builds a pooling descriptor for use in the other
...@@ -1188,57 +1215,53 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1188,57 +1215,53 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
# @register_opt('cudnn') # this optimizer is registered in opt.py instead. @local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
@local_optimizer([GpuConv]) AbstractConv2d_gradInputs])
def local_conv_dnn(node): def local_abstractconv_cudnn(node):
if isinstance(node.op, GpuConv): if (not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
if not dnn_available(node.outputs[0].type.context_name): AbstractConv2d_gradInputs))):
return return None
if node.op.border_mode not in ['full', 'valid']: inp1 = node.inputs[0]
return inp2 = node.inputs[1]
img, kern = node.inputs
border_mode = node.op.border_mode
subsample = node.op.subsample
direction_hint = node.op.direction_hint
rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)
return [rval]
if (not isinstance(inp1.type, GpuArrayType) or
not isinstance(inp2.type, GpuArrayType)):
return None
# This optimizer is registered in opt.py as part of the meta-optimizer. if not dnn_available(inp1.type.context_name):
# It tries exactly the opposite code path of what local_conv_dnn() uses, return None
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv]) if node.op.filter_flip:
def local_conv_dnn_alternative(node): conv_mode = 'conv'
if isinstance(node.op, GpuConv):
if not dnn_available(node.outputs[0].type.context_name):
return
border_mode = node.op.border_mode
subsample = node.op.subsample
if border_mode not in ['full', 'valid'] or subsample != (1, 1):
return
img, kern = node.inputs
direction_hint = node.op.direction_hint
if border_mode == 'full':
# for a full convolution, try using the forward pass instead
# of the backward pass wrt. inputs
direction_hint = 'forward!'
elif border_mode == 'valid':
# for a valid convolution, try using the backward pass wrt.
# weights instead of the forward pass and vice versa
if direction_hint == 'bprop weights':
direction_hint = 'forward'
else: else:
direction_hint = 'bprop weights' conv_mode = 'cross'
rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample, if isinstance(node.op, AbstractConv2d):
direction_hint=direction_hint) rval = dnn_conv(inp1, inp2,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
direction_hint='forward!',
conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradWeights):
shape = (inp2.shape[1], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1])
rval = dnn_gradweight(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1])
rval = dnn_gradinput(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode=conv_mode)
return [rval] return [rval]
conv_groupopt.register('local_abstractconv_cudnn',
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20, local_abstractconv_cudnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn') 'fast_compile', 'fast_run',
'gpuarray', 'conv_dnn', 'cudnn')
@inplace_allocempty(GpuDnnConv, 2) @inplace_allocempty(GpuDnnConv, 2)
......
...@@ -14,7 +14,13 @@ from theano.gof.optdb import LocalGroupDB ...@@ -14,7 +14,13 @@ from theano.gof.optdb import LocalGroupDB
from theano.scalar.basic import Scalar, Pow, Cast from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor import as_tensor_variable
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv2d,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from .type import (GpuArrayType, GpuArrayConstant, get_context, from .type import (GpuArrayType, GpuArrayConstant, get_context,
...@@ -27,7 +33,6 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -27,7 +33,6 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace) gpugemm_no_inplace)
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax)
...@@ -786,77 +791,49 @@ def local_assert(node, context_name): ...@@ -786,77 +791,49 @@ def local_assert(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([ConvOp]) @op_lifter([ConvOp])
def local_gpu_conv(node, context_name): def local_error_convop(node, context_name):
def GpuConvOp_from_ConvOp(op): assert False, """
logical_img_hw = None ConvOp does not work with the gpuarray backend.
if op.kshp_logical is not None and op.kshp_logical != op.kshp: Use the new convolution interface to have GPU convolution working:
return None theano.tensor.nnet.conv2d()
"""
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp,
version=op.version,
direction_hint=op.direction_hint,
verbose=op.verbose,
imshp=op.imshp,
nkern=op.nkern,
bsize=op.bsize,
fft_opt=op.fft_opt)
if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] /
float(op.imshp[2])))
def make_graph(img, kern):
buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype),
img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img)
img = GpuFromHost(context_name)(img)
return ret(img, kern)
return make_graph
return ret
def values_eq_approx(a, b):
"""
This fct is needed to don't have DebugMode raise useless
error due to ronding error.
This happen as We reduce on the two last dimensions, so this # This deals with any abstract convs that have a transfer somewhere
can raise the absolute error if the number of element we @register_opt('fast_compile')
reduce on is significant. @op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights,
""" AbstractConv2d_gradInputs])
assert a.ndim == 4 def local_lift_abstractconv2d(node, context_name):
atol = None inps = list(node.inputs)
if a.shape[-1] * a.shape[-2] > 100: inps[0] = as_gpuarray_variable(node.inputs[0],
# For float32 the default atol is 1e-5 context_name=context_name)
atol = 3e-5 inps[1] = as_gpuarray_variable(node.inputs[1],
return GpuArrayType.values_eq_approx(a, b, atol=atol) context_name=context_name)
return [node.op(*inps)]
img, kern = node.inputs
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None: # This will deal with ops that don't have an explicit transfer but
return # have one of their inputs on the GPU already and the other not on the
out = gpu_conv(GpuFromHost(context_name)(img), # GPU (to avoid endlessly replacing things).
GpuFromHost(context_name)(kern)) @register_opt('fast_compile')
assert isinstance(out.type, GpuArrayType) @local_optimizer([AbstractConv2d,
# Make sure to keep the broadcastable pattern of the original AbstractConv2d_gradWeights,
# convolution even if we might gain or lose some due to different AbstractConv2d_gradInputs])
# information at the node level. def local_gpu_abstractconv2d(node):
out = tensor.patternbroadcast(out, node.outputs[0].broadcastable) if isinstance(node.op, BaseAbstractConv2d):
out.values_eq_approx = values_eq_approx if ((isinstance(node.inputs[0].type, GpuArrayType) or
return [out] isinstance(node.inputs[1].type, GpuArrayType)) and
not (isinstance(node.inputs[0].type, GpuArrayType) or
# Register this here so that it goes after 'local_gpu_conv' isinstance(node.inputs[1].type, GpuArrayType))):
inps = list(node.inputs)
ctx_name = infer_context_name(inps[0], inps[1])
inps[0] = as_gpuarray_variable(inps[0], context_name=ctx_name)
inps[1] = as_gpuarray_variable(inps[1], context_name=ctx_name)
return as_tensor_variable(node.op(*inps))
# Register this here so that it goes after the abstract lifting
register_opt()(conv_groupopt) register_opt()(conv_groupopt)
......
Diff collapsed.
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment