Commit 5c9e0b31 authored by abergeron

Merge pull request #2298 from nouiz/convdnn_broadcast

Fix broadcast pattern of dnn conv op.
......@@ -342,8 +342,18 @@ def get_c_extract(r, name, sub):
"""Wrapper around c_extract that initializes py_name from storage."""
if any([getattr(c.op, 'check_input', config.check_input) for (c, _) in
r.clients]):
c_extract = r.type.c_extract(name, sub, True)
# check_broadcast is just a hack to easily remove just the
# broadcast check on the old GPU back-end. This check isn't
# done in the new GPU back-end or on the CPU.
if hasattr(c.op, 'check_broadcast'):
try:
c_extract = r.type.c_extract(
name, sub, True,
check_broadcast=c.op.check_broadcast)
except TypeError, e:
c_extract = r.type.c_extract(name, sub, True)
else:
c_extract = r.type.c_extract(name, sub, True)
else:
c_extract = r.type.c_extract(name, sub, False)
......
......@@ -298,6 +298,8 @@ class GpuDimShuffle(GpuOp):
"""
Implement DimShuffle on the gpu.
"""
check_broadcast = False
def __init__(self, input_broadcastable, new_order):
input_broadcastable = tuple(input_broadcastable)
self.input_broadcastable = input_broadcastable
......@@ -2355,6 +2357,8 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
"""
Implement subtensor on the gpu.
"""
check_broadcast = False
# __hash__, __eq__, __str__ come from tensor.Subtensor
def make_node(self, x, *inputs):
assert isinstance(x.type, CudaNdarrayType)
......@@ -3352,6 +3356,7 @@ class GpuContiguous(GpuOp):
not already c contiguous.
"""
view_map = {0: [0]}
check_input = False
def __eq__(self, other):
return type(self) == type(other)
......
......@@ -513,8 +513,9 @@ class BaseGpuCorrMM(GpuOp):
integers
:param subsample: perform subsampling of the output (default: (1, 1))
:param pad: *deprecated*, now you should always use border_mode
"""
check_broadcast = False
def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)):
if pad != (0, 0):
......@@ -1498,6 +1499,8 @@ class GpuConv(GpuOp):
"""
Implement the batched and stacked 2d convolution on the gpu.
"""
check_broadcast = False
@staticmethod
def logical_output_shape_2d(imshp, kshp, mode):
if mode == 'valid':
......
......@@ -7,7 +7,7 @@ from theano.gof.type import CDataType
from theano.compat import PY3
from theano.tensor.nnet import SoftmaxGrad
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import (GpuOp, cuda_available)
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous, HostFromGpu)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
......@@ -244,23 +244,7 @@ class GpuDnnConvDesc(GpuOp):
class GpuDnnConvBase(DnnBase):
__props__ = ()
def make_node(self, img, kern, desc):
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
broadcastable = (img.type.broadcastable[0],
kern.type.broadcastable[0],
False, False)
return Apply(self, [img, kern, desc],
[CudaNdarrayType(broadcastable)()])
check_broadcast = False
def c_support_code_struct(self, node, struct_id):
return """
......@@ -417,6 +401,24 @@ class GpuDnnConv(GpuDnnConvBase):
conv_op = 'cudnnConvolutionForward'
path_flag = 'CUDNN_CONVOLUTION_FWD'
def make_node(self, img, kern, desc):
    """Construct the Apply node for a forward cuDNN convolution.

    `img` and `kern` are transferred to the GPU when needed and must
    both be 4D; `desc` must wrap a cudnnConvolutionDescriptor_t.  The
    output is a 4D CudaNdarray whose first two broadcast flags are
    taken from the image batch dimension and the kernel first
    dimension; the spatial dimensions are never broadcastable.
    """
    img = as_cuda_ndarray_variable(img)
    kern = as_cuda_ndarray_variable(kern)
    # Validate ranks with identical error messages to the original.
    for var, label in ((img, 'img'), (kern, 'kern')):
        if var.type.ndim != 4:
            raise TypeError('%s must be 4D tensor' % label)
    desc_ok = (isinstance(desc.type, CDataType) and
               desc.type.ctype == 'cudnnConvolutionDescriptor_t')
    if not desc_ok:
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')
    out_bcast = (img.type.broadcastable[0],
                 kern.type.broadcastable[0],
                 False, False)
    out = CudaNdarrayType(out_bcast)()
    return Apply(self, [img, kern, desc], [out])
def grad(self, inp, grads):
img, kerns, desc = inp
top, = grads
......@@ -464,6 +466,24 @@ class GpuDnnConvGradW(GpuDnnConvBase):
# not connected to desc
return [[1], [1], [0]]
def make_node(self, img, topgrad, desc):
    """Construct the Apply node for the cuDNN gradient wrt the weights.

    `img` and `topgrad` are moved to the GPU when needed and must be
    4D; `desc` must wrap a cudnnConvolutionDescriptor_t.  The first
    two broadcast flags of the output are taken from the second
    dimension of `topgrad` and of `img` respectively (presumably the
    output-filter and input-channel dimensions — matches the weight
    layout used by the forward op).
    """
    img = as_cuda_ndarray_variable(img)
    topgrad = as_cuda_ndarray_variable(topgrad)
    # Validate ranks with identical error messages to the original.
    for var, label in ((img, 'img'), (topgrad, 'topgrad')):
        if var.type.ndim != 4:
            raise TypeError('%s must be 4D tensor' % label)
    desc_ok = (isinstance(desc.type, CDataType) and
               desc.type.ctype == 'cudnnConvolutionDescriptor_t')
    if not desc_ok:
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')
    out_bcast = [topgrad.type.broadcastable[1],
                 img.type.broadcastable[1],
                 False, False]
    out = CudaNdarrayType(out_bcast)()
    return Apply(self, [img, topgrad, desc], [out])
class GpuDnnConvGradI(GpuDnnConvBase):
"""
......@@ -496,6 +516,24 @@ class GpuDnnConvGradI(GpuDnnConvBase):
# not connected to desc
return [[1], [1], [0]]
def make_node(self, kern, topgrad, desc):
    """Construct the Apply node for the cuDNN gradient wrt the input.

    `kern` and `topgrad` are moved to the GPU when needed and must be
    4D; `desc` must wrap a cudnnConvolutionDescriptor_t.  The first
    two broadcast flags of the output come from the batch dimension of
    `topgrad` and the second dimension of `kern` (presumably the
    input-channel dimension of the weights); spatial dimensions are
    never broadcastable.
    """
    kern = as_cuda_ndarray_variable(kern)
    topgrad = as_cuda_ndarray_variable(topgrad)
    # Validate ranks with identical error messages to the original.
    for var, label in ((kern, 'kern'), (topgrad, 'topgrad')):
        if var.type.ndim != 4:
            raise TypeError('%s must be 4D tensor' % label)
    desc_ok = (isinstance(desc.type, CDataType) and
               desc.type.ctype == 'cudnnConvolutionDescriptor_t')
    if not desc_ok:
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')
    out_bcast = [topgrad.type.broadcastable[0],
                 kern.type.broadcastable[1],
                 False, False]
    out = CudaNdarrayType(out_bcast)()
    return Apply(self, [kern, topgrad, desc], [out])
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None):
......
......@@ -35,8 +35,6 @@ if theano.config.mode == 'FAST_COMPILE':
else:
theano_mode = theano.compile.mode.get_default_mode().including('gpu')
cuda_tensor4 = cuda.CudaNdarrayType([False] * 4)
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
cuda.shared_constructor(numpy.zeros(2, dtype='float32'))
......@@ -189,13 +187,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
t0 = time.time()
cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time()
i = cuda_tensor4()
k = cuda_tensor4()
i = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_img.shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_kern.shape])()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
assert [(sh == 1) is br for
sh, br in zip(cpuval.shape[:2], op.type.broadcastable[:2])]
f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
......@@ -905,8 +907,10 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
i = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_img.shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_kern.shape])()
if direction == 'fprop':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
......@@ -971,8 +975,10 @@ def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
i = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_img.shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_kern.shape])()
# TODO: also test custom pad values
corr_op = op(mode, subsample)(i, k)
......@@ -1009,9 +1015,12 @@ def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
('top', 'dtop/dbottom', 'dtop/dweight',
'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2],
outputs[::2], outputs[1::2],
('top', 'dtop/dbottom', 'dtop/dweight',
'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]
assert_allclose(a, b, rtol=1e-4)
......
......@@ -7,7 +7,6 @@ import warnings
import numpy
import theano
from theano import config
from theano import Type, Variable
from theano import tensor, config
from theano import scalar as scal
......@@ -280,7 +279,8 @@ class CudaNdarrayType(Type):
def c_init(self, name, sub):
    """Emit C code initializing the storage variable `name` to NULL."""
    return "%s = NULL;" % (name,)
def c_extract(self, name, sub, check_input=True):
def c_extract(self, name, sub, check_input=True,
check_broadcast=True):
sio = StringIO()
fail = sub['fail']
nd = self.ndim
......@@ -307,7 +307,7 @@ class CudaNdarrayType(Type):
//std::cerr << "c_extract " << %(name)s << " nd check passed\\n";
""" % locals()
for i, b in enumerate(self.broadcastable):
if b:
if b and check_broadcast:
print >> sio, """
if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1)
{
......
......@@ -547,7 +547,9 @@ get_scalar_constant_value_elemwises = (
def get_scalar_constant_value(orig_v, elemwise=True):
"""return the constant scalar(0-D) value underlying variable `v`
If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast
If v is the output of dimshuffles, fills, allocs, rebroadcasts,
cast, OutputGuard, DeepCopyOp, ScalarFromTensor, ScalarOp,
Elemwise and some pattern with Subtensor,
this function digs through them.
If `v` is not some view of constant scalar data, then raise a
......@@ -587,12 +589,14 @@ def get_scalar_constant_value(orig_v, elemwise=True):
continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant):
return v.owner.inputs[0].data.shape[v.owner.op.i]
return numpy.asarray(v.owner.inputs[0].data.shape[v.owner.op.i])
# Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow.
# We put all the scalar Ops used by get_canonical_form_slice()
# to allow it to determine the broadcast pattern correctly.
elif isinstance(v.owner.op, ScalarFromTensor):
return get_scalar_constant_value(v.owner.inputs[0])
elif isinstance(v.owner.op, scal.ScalarOp):
if isinstance(v.owner.op, scal.Second):
# We don't need both input to be constant for second
......@@ -3504,9 +3508,9 @@ class Join(Op):
# Join op should get at least one input to join
assert len(ishapes) > 1
n_dim = len(ishapes[1])
for shape in ishapes[1:]:
assert shape is not None
assert len(shape) == n_dim
for shp in ishapes[1:]:
assert shp is not None
assert len(shp) == n_dim
out_shapes = []
for dim in xrange(n_dim):
......@@ -3522,8 +3526,8 @@ class Join(Op):
t_side = ishapes[1][dim]
f_side = ishapes[1][dim]
# loop over tensors and sum for the joining dimension
for shape in ishapes[2:]:
t_side = t_side + shape[dim]
for shp in ishapes[2:]:
t_side = t_side + shp[dim]
# return the dimensions found
out_shapes.append(switch(eq(dim, node.inputs[0]),
t_side, f_side))
......
......@@ -16,7 +16,7 @@ from theano.gof.python25 import maxsize
from theano.printing import pprint
from theano import scalar as scal
from theano.tensor.basic import (addbroadcast, clip, get_scalar_constant_value,
ARange, TensorType)
ARange, TensorType, NotScalarConstantError)
from theano.tensor.elemwise import DimShuffle
from theano.tensor.type_other import NoneConst, SliceType, make_slice
from theano import config
......@@ -470,15 +470,22 @@ class Subtensor(Op):
broadcastable = []
for i, (p, bc) in enumerate(izip(padded, x.type.broadcastable)):
if isinstance(p, slice):
if bc and p.start in [None, 0]:
if bc:
start = p.start
if start is None:
start = 0
if (p.stop is None or
(isinstance(p.stop, (int, numpy.integer)) and
p.stop > start)):
broadcastable.append(True)
continue
try:
start = get_scalar_constant_value(start)
except NotScalarConstantError:
pass
if start in [None, 0]:
start = p.start
if start is None:
start = 0
if (p.stop is None or
(isinstance(p.stop, (int, numpy.integer,
numpy.ndarray)) and
p.stop > start)):
broadcastable.append(True)
continue
broadcastable.append(False)
......
Markdown is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Finish editing this comment first!
Sign up or sign in to post a comment