提交 5c9e0b31 authored 作者: abergeron's avatar abergeron

Merge pull request #2298 from nouiz/convdnn_broadcast

Fix broadcast pattern of dnn conv op.
...@@ -342,8 +342,18 @@ def get_c_extract(r, name, sub): ...@@ -342,8 +342,18 @@ def get_c_extract(r, name, sub):
"""Wrapper around c_extract that initializes py_name from storage.""" """Wrapper around c_extract that initializes py_name from storage."""
if any([getattr(c.op, 'check_input', config.check_input) for (c, _) in if any([getattr(c.op, 'check_input', config.check_input) for (c, _) in
r.clients]): r.clients]):
# check_broadcast is just a hack to easily remove just the
c_extract = r.type.c_extract(name, sub, True) # broadcast check on the old GPU back-end. This check isn't
# done in the new GPU back-end or on the CPU.
if hasattr(c.op, 'check_broadcast'):
try:
c_extract = r.type.c_extract(
name, sub, True,
check_broadcast=c.op.check_broadcast)
except TypeError, e:
c_extract = r.type.c_extract(name, sub, True)
else:
c_extract = r.type.c_extract(name, sub, True)
else: else:
c_extract = r.type.c_extract(name, sub, False) c_extract = r.type.c_extract(name, sub, False)
......
...@@ -298,6 +298,8 @@ class GpuDimShuffle(GpuOp): ...@@ -298,6 +298,8 @@ class GpuDimShuffle(GpuOp):
""" """
Implement DimShuffle on the gpu. Implement DimShuffle on the gpu.
""" """
check_broadcast = False
def __init__(self, input_broadcastable, new_order): def __init__(self, input_broadcastable, new_order):
input_broadcastable = tuple(input_broadcastable) input_broadcastable = tuple(input_broadcastable)
self.input_broadcastable = input_broadcastable self.input_broadcastable = input_broadcastable
...@@ -2355,6 +2357,8 @@ class GpuSubtensor(GpuOp, tensor.Subtensor): ...@@ -2355,6 +2357,8 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
""" """
Implement subtensor on the gpu. Implement subtensor on the gpu.
""" """
check_broadcast = False
# __hash__, __eq__, __str__ come from tensor.Subtensor # __hash__, __eq__, __str__ come from tensor.Subtensor
def make_node(self, x, *inputs): def make_node(self, x, *inputs):
assert isinstance(x.type, CudaNdarrayType) assert isinstance(x.type, CudaNdarrayType)
...@@ -3352,6 +3356,7 @@ class GpuContiguous(GpuOp): ...@@ -3352,6 +3356,7 @@ class GpuContiguous(GpuOp):
not already c contiguous. not already c contiguous.
""" """
view_map = {0: [0]} view_map = {0: [0]}
check_input = False
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
......
...@@ -513,8 +513,9 @@ class BaseGpuCorrMM(GpuOp): ...@@ -513,8 +513,9 @@ class BaseGpuCorrMM(GpuOp):
integers integers
:param subsample: perform subsampling of the output (default: (1, 1)) :param subsample: perform subsampling of the output (default: (1, 1))
:param pad: *deprecated*, now you should always use border_mode :param pad: *deprecated*, now you should always use border_mode
""" """
check_broadcast = False
def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)): def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)):
if pad != (0, 0): if pad != (0, 0):
...@@ -1498,6 +1499,8 @@ class GpuConv(GpuOp): ...@@ -1498,6 +1499,8 @@ class GpuConv(GpuOp):
""" """
Implement the batched and stacked 2d convolution on the gpu. Implement the batched and stacked 2d convolution on the gpu.
""" """
check_broadcast = False
@staticmethod @staticmethod
def logical_output_shape_2d(imshp, kshp, mode): def logical_output_shape_2d(imshp, kshp, mode):
if mode == 'valid': if mode == 'valid':
......
...@@ -7,7 +7,7 @@ from theano.gof.type import CDataType ...@@ -7,7 +7,7 @@ from theano.gof.type import CDataType
from theano.compat import PY3 from theano.compat import PY3
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import (GpuOp, cuda_available) from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous, HostFromGpu) gpu_contiguous, HostFromGpu)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
...@@ -244,23 +244,7 @@ class GpuDnnConvDesc(GpuOp): ...@@ -244,23 +244,7 @@ class GpuDnnConvDesc(GpuOp):
class GpuDnnConvBase(DnnBase): class GpuDnnConvBase(DnnBase):
__props__ = () __props__ = ()
check_broadcast = False
def make_node(self, img, kern, desc):
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
broadcastable = (img.type.broadcastable[0],
kern.type.broadcastable[0],
False, False)
return Apply(self, [img, kern, desc],
[CudaNdarrayType(broadcastable)()])
def c_support_code_struct(self, node, struct_id): def c_support_code_struct(self, node, struct_id):
return """ return """
...@@ -417,6 +401,24 @@ class GpuDnnConv(GpuDnnConvBase): ...@@ -417,6 +401,24 @@ class GpuDnnConv(GpuDnnConvBase):
conv_op = 'cudnnConvolutionForward' conv_op = 'cudnnConvolutionForward'
path_flag = 'CUDNN_CONVOLUTION_FWD' path_flag = 'CUDNN_CONVOLUTION_FWD'
def make_node(self, img, kern, desc):
    """Build the Apply node for a convolution of `img` with `kern`.

    Both tensor operands are first passed through
    `as_cuda_ndarray_variable`, then validated.

    :param img: operand that must be 4D once converted.
    :param kern: operand that must be 4D once converted.
    :param desc: variable whose type is a `CDataType` with ctype
        'cudnnConvolutionDescriptor_t'.

    :raises TypeError: if `img` or `kern` is not 4D, or if `desc` is
        not a cudnnConvolutionDescriptor_t.

    :returns: an Apply node with inputs ``[img, kern, desc]`` and a
        single CudaNdarrayType output.  Its first two broadcast flags
        are copied from dimension 0 of `img` and dimension 0 of `kern`;
        the last two are always False.
    """
    img = as_cuda_ndarray_variable(img)
    kern = as_cuda_ndarray_variable(kern)

    # Operands are validated in declaration order so that the first
    # offending argument is the one reported.
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')

    desc_type = desc.type
    if (not isinstance(desc_type, CDataType) or
            desc_type.ctype != 'cudnnConvolutionDescriptor_t'):
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')

    # Only the two leading dimensions can inherit a broadcast flag.
    leading = (img.type.broadcastable[0], kern.type.broadcastable[0])
    out_type = CudaNdarrayType(leading + (False, False))
    return Apply(self, [img, kern, desc], [out_type()])
def grad(self, inp, grads): def grad(self, inp, grads):
img, kerns, desc = inp img, kerns, desc = inp
top, = grads top, = grads
...@@ -464,6 +466,24 @@ class GpuDnnConvGradW(GpuDnnConvBase): ...@@ -464,6 +466,24 @@ class GpuDnnConvGradW(GpuDnnConvBase):
# not connected to desc # not connected to desc
return [[1], [1], [0]] return [[1], [1], [0]]
def make_node(self, img, topgrad, desc):
    """Build the Apply node taking `img`, `topgrad` and `desc`.

    Both tensor operands are first passed through
    `as_cuda_ndarray_variable`, then validated.

    :param img: operand that must be 4D once converted.
    :param topgrad: operand that must be 4D once converted.
    :param desc: variable whose type is a `CDataType` with ctype
        'cudnnConvolutionDescriptor_t'.

    :raises TypeError: if `img` or `topgrad` is not 4D, or if `desc`
        is not a cudnnConvolutionDescriptor_t.

    :returns: an Apply node with inputs ``[img, topgrad, desc]`` and a
        single CudaNdarrayType output.  Its first two broadcast flags
        are copied from dimension 1 of `topgrad` and dimension 1 of
        `img`; the last two are always False.
    """
    img = as_cuda_ndarray_variable(img)
    topgrad = as_cuda_ndarray_variable(topgrad)

    # Operands are validated in declaration order so that the first
    # offending argument is the one reported.
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if topgrad.type.ndim != 4:
        raise TypeError('topgrad must be 4D tensor')

    desc_type = desc.type
    if (not isinstance(desc_type, CDataType) or
            desc_type.ctype != 'cudnnConvolutionDescriptor_t'):
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')

    # Only the two leading dimensions can inherit a broadcast flag.
    out_pattern = [topgrad.type.broadcastable[1],
                   img.type.broadcastable[1]]
    out_pattern += [False, False]
    out_type = CudaNdarrayType(out_pattern)
    return Apply(self, [img, topgrad, desc], [out_type()])
class GpuDnnConvGradI(GpuDnnConvBase): class GpuDnnConvGradI(GpuDnnConvBase):
""" """
...@@ -496,6 +516,24 @@ class GpuDnnConvGradI(GpuDnnConvBase): ...@@ -496,6 +516,24 @@ class GpuDnnConvGradI(GpuDnnConvBase):
# not connected to desc # not connected to desc
return [[1], [1], [0]] return [[1], [1], [0]]
def make_node(self, kern, topgrad, desc):
    """Build the Apply node taking `kern`, `topgrad` and `desc`.

    Both tensor operands are first passed through
    `as_cuda_ndarray_variable`, then validated.

    :param kern: operand that must be 4D once converted.
    :param topgrad: operand that must be 4D once converted.
    :param desc: variable whose type is a `CDataType` with ctype
        'cudnnConvolutionDescriptor_t'.

    :raises TypeError: if `kern` or `topgrad` is not 4D, or if `desc`
        is not a cudnnConvolutionDescriptor_t.

    :returns: an Apply node with inputs ``[kern, topgrad, desc]`` and
        a single CudaNdarrayType output.  Its first two broadcast
        flags are copied from dimension 0 of `topgrad` and dimension 1
        of `kern`; the last two are always False.
    """
    kern = as_cuda_ndarray_variable(kern)
    topgrad = as_cuda_ndarray_variable(topgrad)

    # Operands are validated in declaration order so that the first
    # offending argument is the one reported.
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    if topgrad.type.ndim != 4:
        raise TypeError('topgrad must be 4D tensor')

    desc_type = desc.type
    if (not isinstance(desc_type, CDataType) or
            desc_type.ctype != 'cudnnConvolutionDescriptor_t'):
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')

    # Only the two leading dimensions can inherit a broadcast flag.
    out_pattern = [topgrad.type.broadcastable[0],
                   kern.type.broadcastable[1]]
    out_pattern += [False, False]
    out_type = CudaNdarrayType(out_pattern)
    return Apply(self, [kern, topgrad, desc], [out_type()])
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None): conv_mode='conv', direction_hint=None):
......
...@@ -35,8 +35,6 @@ if theano.config.mode == 'FAST_COMPILE': ...@@ -35,8 +35,6 @@ if theano.config.mode == 'FAST_COMPILE':
else: else:
theano_mode = theano.compile.mode.get_default_mode().including('gpu') theano_mode = theano.compile.mode.get_default_mode().including('gpu')
cuda_tensor4 = cuda.CudaNdarrayType([False] * 4)
device_id = theano.sandbox.cuda.use.device_number device_id = theano.sandbox.cuda.use.device_number
if device_id is None: if device_id is None:
cuda.shared_constructor(numpy.zeros(2, dtype='float32')) cuda.shared_constructor(numpy.zeros(2, dtype='float32'))
...@@ -189,13 +187,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -189,13 +187,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
t0 = time.time() t0 = time.time()
cpuval = py_conv(npy_img, npy_kern, mode, subsample) cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time() t1 = time.time()
i = cuda_tensor4() i = cuda.CudaNdarrayType(
k = cuda_tensor4() broadcastable=[sh == 1 for sh in npy_img.shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_kern.shape])()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode, op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample, subsample=subsample,
version=version, version=version,
verbose=verbose, verbose=verbose,
kshp=compile_kshp)(i, k) kshp=compile_kshp)(i, k)
assert [(sh == 1) is br for
sh, br in zip(cpuval.shape[:2], op.type.broadcastable[:2])]
f = theano.function([i, k], op, mode=theano_mode) f = theano.function([i, k], op, mode=theano_mode)
if cls is not None: if cls is not None:
assert any([isinstance(node.op, cls) assert any([isinstance(node.op, cls)
...@@ -905,8 +907,10 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy, ...@@ -905,8 +907,10 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4() i = cuda.CudaNdarrayType(
k = cuda_tensor4() broadcastable=[sh == 1 for sh in npy_img.shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_kern.shape])()
if direction == 'fprop': if direction == 'fprop':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample) cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
...@@ -971,8 +975,10 @@ def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op): ...@@ -971,8 +975,10 @@ def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4() i = cuda.CudaNdarrayType(
k = cuda_tensor4() broadcastable=[sh == 1 for sh in npy_img.shape])()
k = cuda.CudaNdarrayType(
broadcastable=[sh == 1 for sh in npy_kern.shape])()
# TODO: also test custom pad values # TODO: also test custom pad values
corr_op = op(mode, subsample)(i, k) corr_op = op(mode, subsample)(i, k)
...@@ -1009,9 +1015,12 @@ def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op): ...@@ -1009,9 +1015,12 @@ def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
allvals = f(npy_img, npy_kern) allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2], for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2],
('top', 'dtop/dbottom', 'dtop/dweight', outputs[::2], outputs[1::2],
'dtop/dbottom/dweight', 'dtop/dweight/dbottom')): ('top', 'dtop/dbottom', 'dtop/dweight',
'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]
assert_allclose(a, b, rtol=1e-4) assert_allclose(a, b, rtol=1e-4)
......
...@@ -7,7 +7,6 @@ import warnings ...@@ -7,7 +7,6 @@ import warnings
import numpy import numpy
import theano import theano
from theano import config
from theano import Type, Variable from theano import Type, Variable
from theano import tensor, config from theano import tensor, config
from theano import scalar as scal from theano import scalar as scal
...@@ -280,7 +279,8 @@ class CudaNdarrayType(Type): ...@@ -280,7 +279,8 @@ class CudaNdarrayType(Type):
def c_init(self, name, sub): def c_init(self, name, sub):
return "%(name)s = NULL;" % locals() return "%(name)s = NULL;" % locals()
def c_extract(self, name, sub, check_input=True): def c_extract(self, name, sub, check_input=True,
check_broadcast=True):
sio = StringIO() sio = StringIO()
fail = sub['fail'] fail = sub['fail']
nd = self.ndim nd = self.ndim
...@@ -307,7 +307,7 @@ class CudaNdarrayType(Type): ...@@ -307,7 +307,7 @@ class CudaNdarrayType(Type):
//std::cerr << "c_extract " << %(name)s << " nd check passed\\n"; //std::cerr << "c_extract " << %(name)s << " nd check passed\\n";
""" % locals() """ % locals()
for i, b in enumerate(self.broadcastable): for i, b in enumerate(self.broadcastable):
if b: if b and check_broadcast:
print >> sio, """ print >> sio, """
if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1) if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1)
{ {
......
...@@ -547,7 +547,9 @@ get_scalar_constant_value_elemwises = ( ...@@ -547,7 +547,9 @@ get_scalar_constant_value_elemwises = (
def get_scalar_constant_value(orig_v, elemwise=True): def get_scalar_constant_value(orig_v, elemwise=True):
"""return the constant scalar(0-D) value underlying variable `v` """return the constant scalar(0-D) value underlying variable `v`
If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast If v is the output of dimshuffles, fills, allocs, rebroadcasts,
cast, OutputGuard, DeepCopyOp, ScalarFromTensor, ScalarOp,
Elemwise and some pattern with Subtensor,
this function digs through them. this function digs through them.
If `v` is not some view of constant scalar data, then raise a If `v` is not some view of constant scalar data, then raise a
...@@ -587,12 +589,14 @@ def get_scalar_constant_value(orig_v, elemwise=True): ...@@ -587,12 +589,14 @@ def get_scalar_constant_value(orig_v, elemwise=True):
continue continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i): elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant): if isinstance(v.owner.inputs[0], Constant):
return v.owner.inputs[0].data.shape[v.owner.op.i] return numpy.asarray(v.owner.inputs[0].data.shape[v.owner.op.i])
# Don't act as the constant_folding optimization here as this # Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would # fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow. # mess with the stabilization optimization and be too slow.
# We put all the scalar Ops used by get_canonical_form_slice() # We put all the scalar Ops used by get_canonical_form_slice()
# to allow it to determine the broadcast pattern correctly. # to allow it to determine the broadcast pattern correctly.
elif isinstance(v.owner.op, ScalarFromTensor):
return get_scalar_constant_value(v.owner.inputs[0])
elif isinstance(v.owner.op, scal.ScalarOp): elif isinstance(v.owner.op, scal.ScalarOp):
if isinstance(v.owner.op, scal.Second): if isinstance(v.owner.op, scal.Second):
# We don't need both input to be constant for second # We don't need both input to be constant for second
...@@ -3504,9 +3508,9 @@ class Join(Op): ...@@ -3504,9 +3508,9 @@ class Join(Op):
# Join op should get at least one input to join # Join op should get at least one input to join
assert len(ishapes) > 1 assert len(ishapes) > 1
n_dim = len(ishapes[1]) n_dim = len(ishapes[1])
for shape in ishapes[1:]: for shp in ishapes[1:]:
assert shape is not None assert shp is not None
assert len(shape) == n_dim assert len(shp) == n_dim
out_shapes = [] out_shapes = []
for dim in xrange(n_dim): for dim in xrange(n_dim):
...@@ -3522,8 +3526,8 @@ class Join(Op): ...@@ -3522,8 +3526,8 @@ class Join(Op):
t_side = ishapes[1][dim] t_side = ishapes[1][dim]
f_side = ishapes[1][dim] f_side = ishapes[1][dim]
# loop over tensors and sum for the joining dimension # loop over tensors and sum for the joining dimension
for shape in ishapes[2:]: for shp in ishapes[2:]:
t_side = t_side + shape[dim] t_side = t_side + shp[dim]
# return the dimensions found # return the dimensions found
out_shapes.append(switch(eq(dim, node.inputs[0]), out_shapes.append(switch(eq(dim, node.inputs[0]),
t_side, f_side)) t_side, f_side))
......
...@@ -16,7 +16,7 @@ from theano.gof.python25 import maxsize ...@@ -16,7 +16,7 @@ from theano.gof.python25 import maxsize
from theano.printing import pprint from theano.printing import pprint
from theano import scalar as scal from theano import scalar as scal
from theano.tensor.basic import (addbroadcast, clip, get_scalar_constant_value, from theano.tensor.basic import (addbroadcast, clip, get_scalar_constant_value,
ARange, TensorType) ARange, TensorType, NotScalarConstantError)
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano.tensor.type_other import NoneConst, SliceType, make_slice from theano.tensor.type_other import NoneConst, SliceType, make_slice
from theano import config from theano import config
...@@ -470,15 +470,22 @@ class Subtensor(Op): ...@@ -470,15 +470,22 @@ class Subtensor(Op):
broadcastable = [] broadcastable = []
for i, (p, bc) in enumerate(izip(padded, x.type.broadcastable)): for i, (p, bc) in enumerate(izip(padded, x.type.broadcastable)):
if isinstance(p, slice): if isinstance(p, slice):
if bc and p.start in [None, 0]: if bc:
start = p.start start = p.start
if start is None: try:
start = 0 start = get_scalar_constant_value(start)
if (p.stop is None or except NotScalarConstantError:
(isinstance(p.stop, (int, numpy.integer)) and pass
p.stop > start)): if start in [None, 0]:
broadcastable.append(True) start = p.start
continue if start is None:
start = 0
if (p.stop is None or
(isinstance(p.stop, (int, numpy.integer,
numpy.ndarray)) and
p.stop > start)):
broadcastable.append(True)
continue
broadcastable.append(False) broadcastable.append(False)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论