Commit ab206dc1, authored by abergeron

Merge pull request #1952 from nouiz/gpu_sum

Move sum to the GPU more frequently, and fix crashes
......@@ -571,7 +571,10 @@ the elements of the shape).
.. code-block:: python
theano.compile.ops.register_shape_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
theano.compile.ops.register_shape_i_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
theano.compile.ops.register_shape_i_c_code(YOUR_TYPE_CLASS, THE_C_CODE, CHECK_INPUT, version=())
The C code works as the ViewOp. Shape_i has the additional ``i`` parameter
that you can use with ``%(i)s``.
In your CHECK_INPUT, you must check that the input has enough ndim to
be able to get the i-th shape.
......@@ -349,13 +349,13 @@ class Shape_i(gof.Op):
version = []
# If any of the c code is unversionned, we have to return ()
# Else, we will return a list of (type name, version) pairs.
for t, (c, v) in sorted(self.c_code_and_version.items(),
key=lambda pair: str(pair[0])):
for t, (c, ci, v) in sorted(self.c_code_and_version.items(),
key=lambda pair: str(pair[0])):
if not v:
warnings.warn("Type %s has C code for Shape_i, but it has "
"no version. You should add a 'version' keyword arg "
"when calling register_shape_i_c_code." % t,
stacklevel=2)
"no version. You should add a 'version' keyword "
"arg when calling register_shape_i_c_code." % t,
stacklevel=2)
return ()
version.append((str(t), v))
......@@ -372,14 +372,8 @@ class Shape_i(gof.Op):
itype = node.inputs[0].type.__class__
if itype in self.c_code_and_version:
sc = """
if (%(i)s>=PyArray_NDIM(%(iname)s)){
PyErr_SetString(PyExc_TypeError, "Number of dimensions lower than expected");
%(fail)s
}
""" % locals()
code, version = self.c_code_and_version[itype]
return sc + code % locals()
code, check_input, version = self.c_code_and_version[itype]
return (check_input + code) % locals()
# Else, no C code
return super(Shape_i, self).c_code(node, name, inames, onames, sub)
......@@ -391,7 +385,7 @@ class Shape_i(gof.Op):
return [None]
def register_shape_i_c_code(typ, code, version=()):
def register_shape_i_c_code(typ, code, check_input, version=()):
""" Tell Shape_i how to generate C code for a Theano Type
:param typ: A Theano type. It must be the Theano class itself and not an
......@@ -401,13 +395,14 @@ def register_shape_i_c_code(typ, code, version=()):
variable names respectively.
:param version: A number indicating the version of the code, for cache.
"""
Shape_i.c_code_and_version[typ] = (code, version)
Shape_i.c_code_and_version[typ] = (code, check_input, version)
# List of Theano Types that one can add an extra dimension and for which
# Scan can deal with.
expandable_types = ()
class FromFunctionOp(gof.Op):
"""
Build a basic Theano Op around a function.
......
......@@ -342,10 +342,10 @@ class ProfileStats(object):
es += [' %2s ']
hs += ['<#call>']
es += [' %5d ']
es += ['%6d ']
hs += ['<#apply>']
es += [' %4d ']
es += [' %4d ']
upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
maxlen = self.line_width - upto_length
......@@ -587,6 +587,7 @@ class ProfileStats(object):
print >> file, ' Time in thunks: %es (%.3f%%)' % (
local_time, 100*local_time / self.fct_call_time)
print >> file, ' Total compile time: %es' % self.compile_time
print >> file, ' Number of Apply nodes: %s' % len(self.apply_time)
print >> file, ' Theano Optimizer time: %es' % self.optimizer_time
print >> file, ' Theano validate time: %es' % self.validate_time
print >> file, (' Theano Linker time (includes C,'
......
......@@ -783,7 +783,7 @@ def pydotprint(fct, outfile=None,
elif var.name or not compact:
g.add_edge(pd.Edge(astr, varstr, label=label))
# else:
#don't add egde here as it is already added from the inputs.
# don't add egde here as it is already added from the inputs.
if cond_highlight:
g.add_subgraph(c1)
......@@ -863,8 +863,8 @@ def pydotprint_variables(vars,
dstr = dstr[:dstr.index('\n')]
varstr = '%s %s' % (dstr, str(var.type))
else:
#a var id is needed as otherwise var with the same type will be
#merged in the graph.
# a var id is needed as otherwise var with the same type will be
# merged in the graph.
varstr = str(var.type)
varstr += ' ' + str(len(var_str))
......@@ -1090,8 +1090,6 @@ def min_informative_str(obj, indent_level=0,
return rval
def var_descriptor(obj, _prev_obs=None, _tag_generator=None):
"""
Returns a string, with no endlines, fully specifying
......@@ -1154,6 +1152,7 @@ def var_descriptor(obj, _prev_obs=None, _tag_generator=None):
return rval
def position_independent_str(obj):
if isinstance(obj, theano.gof.graph.Variable):
rval = 'theano_var'
......
......@@ -86,6 +86,29 @@ register_opt()(theano.tensor.opt.local_track_shape_i)
register_opt(name='gpu_constant_folding')(
tensor.opt.constant_folding)
# This is a partial list of CPU ops that can in some circumstances be
# moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date.
import theano.tensor.signal.downsample
import theano.sandbox.neighbours
cpu_ops_moved_to_gpu = [
tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
tensor.signal.downsample.DownsampleFactorMax,
tensor.signal.downsample.DownsampleFactorMaxGrad,
theano.sandbox.neighbours.Images2Neibs,
tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias,
tensor.nnet.CrossentropySoftmax1HotWithBiasDx,
tensor.nnet.Softmax, tensor.nnet.SoftmaxWithBias,
tensor.Elemwise, tensor.DimShuffle, tensor.CAReduce,
tensor.elemwise.All, tensor.elemwise.Any,
tensor.elemwise.CAReduceDtype, tensor.elemwise.Sum,
tensor.elemwise.Prod, tensor.elemwise.ProdWithoutZeros,
tensor.Reshape, tensor.Flatten, tensor.Subtensor,
tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1,
tensor.IncSubtensor, tensor.Shape, tensor.Join,
tensor.Alloc, tensor.Eye]
class InputToGpuOptimizer(Optimizer):
"""
......@@ -617,7 +640,33 @@ def local_gpu_careduce(node):
if isinstance(node.op.scalar_op, (scal.Add, scal.Mul,
scal.Maximum, scal.Minimum)):
x, = node.inputs
replace = False
if x.owner and isinstance(x.owner.op, HostFromGpu):
replace = True
elif (all([c != "output" and isinstance(c.op, GpuFromHost)
for c, i in node.outputs[0].clients])
and x.owner and x.owner.op.__class__ in
cpu_ops_moved_to_gpu):
# It is not always good to transfer the reduction to
# the GPU when the clients are on the GPU but not the
# reduction input. It means we will transfer the
# (bigger) input to the GPU instead of the
# output(smaller) if we stop optimization there. Most
# of the time, we will also move to the GPU what
# created the input of the reduction. In that case, we
# don't introduce a bigger transfer. It is hard to
# know if after all optimization we will do the bigger
# transfer or not. I'm guessing a heuristic to find
# that. I suppose that if the input of the reduction is
# generated by an op that we can in some cases move to
# the GPU, that we will move it. If some CPU ops are
# supported only in some cases on the GPU, this will
# move to the GPU the reduction when it wasn't a good
# idea.
replace = True
if replace:
if node.op.axis is None:
reduce_mask = [1] * x.type.ndim
else:
......
......@@ -454,12 +454,22 @@ theano.compile.register_view_op_c_code(
""",
version=1)
theano.compile.register_shape_i_c_code(CudaNdarrayType, """
theano.compile.register_shape_i_c_code(
CudaNdarrayType,
"""
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0] =
CudaNdarray_HOST_DIMS(%(iname)s)[%(i)s];
""", version=(0,))
""",
"""
if (%(i)s>=CudaNdarray_NDIM(%(iname)s)){
PyErr_SetString(PyExc_TypeError,
"Number of dimensions lower than expected");
%(fail)s
}
""",
version=(1,))
# Register CudaNdarrayType to the DeepCopyOp list of types with c code.
theano.compile.register_deep_copy_op_c_code(
......
......@@ -20,7 +20,7 @@ from theano.gof.python25 import all, any
from theano.tensor.nnet.conv import ConvOp
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu,
host_from_gpu, gpu_from_host, HostFromGpu, GpuSplit,
gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join, GpuJoin,
)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
......@@ -316,6 +316,7 @@ def local_gpuajoin_1(node):
len(node.inputs) == 2):
return [node.inputs[1]]
@register_opt()
@op_lifter([tensor.Split])
def local_gpua_split(node):
......@@ -334,7 +335,7 @@ def local_gpua_incsubtensor(node):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased)
@register_opt()
@op_lifter([tensor.AdvancedIncSubtensor1])
......@@ -371,8 +372,8 @@ def local_gpua_careduce(node):
dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None))
gvar = greduce(x)
#We need to have the make node called, otherwise the mask can
#be None
# We need to have the make node called, otherwise the mask can
# be None
if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
return greduce
else:
......@@ -406,7 +407,7 @@ def local_gpua_careduce(node):
for idx, m in enumerate(new_mask):
if m == 1:
new_axis.append(idx)
new_greduce = GpuCAReduceCuda(
greduce = GpuCAReduceCuda(
node.op.scalar_op,
axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None),
......@@ -415,12 +416,12 @@ def local_gpua_careduce(node):
reshaped_x = x.reshape(tensor.stack(*new_in_shp))
gpu_reshaped_x = gpu_from_host(reshaped_x)
gvar = greduce(gpu_reshaped_x)
#We need to have the make node called, otherwise the mask can
#be None
# We need to have the make node called, otherwise the mask can
# be None
reshaped_gpu_inputs = [gpu_reshaped_x]
if new_greduce.supports_c_code(reshaped_gpu_inputs):
if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = host_from_gpu(
new_greduce(gpu_reshaped_x))
greduce(gpu_reshaped_x))
if reduce_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_reduce = reduce_reshaped_x.reshape(
......@@ -497,8 +498,8 @@ def local_gpu_conv(node):
if op.kshp_logical is not None and op.kshp_logical != op.kshp:
return None
#print op.kshp, op.imshp[1:3]
#print op.kshp_logical, logical_img_hw
# print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw,
......@@ -508,12 +509,12 @@ def local_gpu_conv(node):
version=op.version,
verbose=op.verbose,
imshp=op.imshp,
)
)
if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
# this case is not implemented
#return None
# return None
rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] /
......@@ -542,7 +543,7 @@ def local_gpu_conv(node):
assert a.ndim == 4
atol = None
if a.shape[-1] * a.shape[-2] > 100:
#For float32 the default atol is 1e-5
# For float32 the default atol is 1e-5
atol = 3e-5
return GpuArrayType.values_eq_approx(a, b, atol=atol)
......@@ -557,7 +558,7 @@ def local_gpu_conv(node):
out = tensor.patternbroadcast(
host_from_gpu(out),
node.outputs[0].broadcastable)
#op_lifter want the output on the GPU.
# op_lifter want the output on the GPU.
out = gpu_from_host(out)
out.values_eq_approx = values_eq_approx
return [out]
......
......@@ -356,6 +356,17 @@ class G_Join_and_Split(T_Join_and_Split):
self.hide_error = theano.config.mode not in ['DebugMode', 'DEBUG_MODE']
self.shared = gpuarray_shared_constructor
def test_gpusplit_opt(self):
rng = numpy.random.RandomState(seed=utt.fetch_seed())
m = self.shared(rng.rand(4, 6).astype(self.floatX))
o = T.Split(2)(m, 0, [2, 2])
f = theano.function([], o, mode=self.mode)
assert any([isinstance(node.op, self.split_op)
for node in f.maker.fgraph.toposort()])
o1, o2 = f()
assert numpy.allclose(o1, m.get_value(borrow=True)[:2])
assert numpy.allclose(o2, m.get_value(borrow=True)[2:])
def test_gpujoin_gpualloc():
a = T.fmatrix('a')
......
......@@ -315,12 +315,22 @@ theano.compile.register_shape_c_code(
""",
version=1)
theano.compile.register_shape_i_c_code(GpuArrayType, """
theano.compile.register_shape_i_c_code(
GpuArrayType,
"""
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0] =
%(iname)s->ga.dimensions[%(i)s];
""", version=(0,))
""",
"""
if (%(i)s>=%(iname)s->ga.nd){
PyErr_SetString(PyExc_TypeError,
"Number of dimensions lower than expected");
%(fail)s
}
""",
version=(1,))
theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
Py_XDECREF(%(oname)s);
......@@ -331,11 +341,11 @@ theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
theano.compile.register_rebroadcast_c_code(
GpuArrayType,
"""
if(PyGpuArray_DIMS(%(iname)s)[%(axis)s] != 1){
if(%(iname)s->ga.dimensions[%(axis)s] != 1){
PyErr_Format(PyExc_ValueError,
"Dimension %(axis)s in Rebroadcast's input was"
" supposed to be 1 (got %%d instead)",
PyGpuArray_DIMS(%(iname)s)[%(axis)s]);
%(iname)s->ga.dimensions[%(axis)s]);
%(fail)s
}
""",
......
......@@ -754,6 +754,9 @@ class ShapeFeature(object):
def shape_tuple(self, r):
"""Return a tuple of symbolic shape vars for tensor variable r"""
if not hasattr(r, 'ndim'):
# This happen for NoneConst.
return None
return tuple([self.shape_ir(i, r) for i in xrange(r.ndim)])
def default_infer_shape(self, node, i_shapes):
......@@ -782,7 +785,9 @@ class ShapeFeature(object):
# don't make the optimizer merge a zillion ones together
# by always returning the same object to represent 1
return self.lscalar_one
if type(s_i) in (int, long) or isinstance(s_i, numpy.integer):
if (type(s_i) in (int, long) or
isinstance(s_i, numpy.integer) or
(isinstance(s_i, numpy.ndarray) and s_i.ndim == 0)):
# this shape is a constant
assert s_i >= 0
return T.constant(s_i, dtype='int64')
......
......@@ -3246,7 +3246,7 @@ class T_Join_and_Split(unittest.TestCase):
# assert tensor.grad(join(1,a,b), a
utt.verify_grad(lambda a, b: join(1, a, b), [av, bv],
eps=1.0e-4, rel_tol=1.0e-3)
eps=1.0e-4, rel_tol=1.0e-3, mode=self.mode)
def test_join_matrix1_using_vertical_stack(self):
a = self.shared(numpy.array([[1, 2, 3], [4, 5, 6]], dtype=self.floatX))
......@@ -3272,7 +3272,7 @@ class T_Join_and_Split(unittest.TestCase):
self.assertTrue((out == want).all())
utt.verify_grad(lambda a, b: join(1, a, b), [av, bv],
eps=1.0e-4, rel_tol=1.0e-3)
eps=1.0e-4, rel_tol=1.0e-3, mode=self.mode)
def test_join_matrixV(self):
"""variable join axis"""
......@@ -3294,8 +3294,8 @@ class T_Join_and_Split(unittest.TestCase):
got = f(1)
self.assertTrue((got == want).all(), (got, want))
utt.verify_grad(lambda a, b: join(0, a, b), [v, 2 * v])
utt.verify_grad(lambda a, b: join(1, a, b), [v, 2 * v])
utt.verify_grad(lambda a, b: join(0, a, b), [v, 2 * v], mode=self.mode)
utt.verify_grad(lambda a, b: join(1, a, b), [v, 2 * v], mode=self.mode)
def test_vector_len(self):
x = lscalar('x')
......@@ -3344,7 +3344,8 @@ class T_Join_and_Split(unittest.TestCase):
assert [True for node in topo if isinstance(node.op, self.join_op)]
f()
utt.verify_grad((lambda a, b: join(1, a, b)), [a_val, b_val], rng=rng)
utt.verify_grad((lambda a, b: join(1, a, b)), [a_val, b_val], rng=rng,
mode=self.mode)
# Should raise an error if dimension 0 does not match
a.set_value(rng.rand(2, 4, 1).astype(self.floatX))
......@@ -3370,7 +3371,8 @@ class T_Join_and_Split(unittest.TestCase):
assert [True for node in topo if isinstance(node.op, self.join_op)]
f()
utt.verify_grad((lambda a, b: join(0, a, b)), [a_val, b_val], rng=rng)
utt.verify_grad((lambda a, b: join(0, a, b)), [a_val, b_val], rng=rng,
mode=self.mode)
# Should raise an error if b_val.shape[0] is not 1
# We can't set the value|
self.assertRaises(TypeError, b.set_value,
......@@ -3402,7 +3404,8 @@ class T_Join_and_Split(unittest.TestCase):
assert [True for node in topo if isinstance(node.op, self.join_op)]
f()
utt.verify_grad((lambda a, b: join(0, a, b)), [a_val, b_val], rng=rng)
utt.verify_grad((lambda a, b: join(0, a, b)), [a_val, b_val], rng=rng,
mode=self.mode)
def test_broadcastable_single_input_broadcastable_dimension(self):
# Test that all broadcastable flags are preserved by a
......@@ -3422,7 +3425,8 @@ class T_Join_and_Split(unittest.TestCase):
node.op, self.join_op)]
f()
utt.verify_grad((lambda a: join(0, a)), [a_val], rng=rng)
utt.verify_grad((lambda a: join(0, a)), [a_val], rng=rng,
mode=self.mode)
# Should raise an error if length of dimension 0 is not 1
self.assertRaises(TypeError, a.set_value,
rng.rand(2, 4, 1).astype(self.floatX))
......@@ -3458,7 +3462,8 @@ class T_Join_and_Split(unittest.TestCase):
e_val = rng.rand(1, 1, 1, 1, 2, 1).astype(self.floatX)
f(a_val, b_val, c_val, d_val, e_val)
utt.verify_grad((lambda a, b, c, d, e: join(0, a, b, c, d, e)),
[a_val, b_val, c_val, d_val, e_val], rng=rng)
[a_val, b_val, c_val, d_val, e_val], rng=rng,
mode=self.mode)
# Should raise an error if length of dimension 0 is not 1
bad_val = rng.rand(2, 1, 1, 1, 2, 1).astype(self.floatX)
self.assertRaises(TypeError, f, bad_val, b_val, c_val, d_val, e_val)
......
......@@ -646,7 +646,14 @@ theano.compile.register_shape_i_c_code(
%(oname)s=(PyArrayObject*)PyArray_EMPTY(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0]=PyArray_DIMS(%(iname)s)[%(i)s];
""",
version=2)
"""
if (%(i)s>=PyArray_NDIM(%(iname)s)){
PyErr_SetString(PyExc_TypeError,
"Number of dimensions lower than expected");
%(fail)s
}
""",
version=3)
# Register TensorType C code for DeepCopyOp
theano.compile.register_deep_copy_op_c_code(
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment