提交 db2bd471 · 作者: lamblin

Merge pull request #754 from jaberg/ipam2012

Misc for IPAM GSS 2012
......@@ -605,7 +605,7 @@ def guess_n_streams(size, warn=True):
Return a guess at a good number of streams.
:param warn: If True, warn when a guess cannot be made (in which case
we return 30 * 256).
we return 60 * 256).
"""
# TODO: a smart way of choosing the number of streams, see #612.
# Note that this code was moved out of `MRG_RandomStreams` so that it can
......@@ -618,14 +618,22 @@ def guess_n_streams(size, warn=True):
r *= s
if r > 6:
r = r/6 # chosen as fastest for rbm_benchmark
return r
# The purpose of sampling from many streams is to be able to use
# the GPU to its full capacity. It just wastes RAM and stream-initialization time to
# allocate more streams than necessary for the GPU.
# XXX: This number is chosen to be good for 280 and 480 architectures,
# Better would be to use pycuda to query the number of
# processors on the GPU device,
# rather than guessing 60.
return min(r, 60 * 256)
else:
if warn:
warnings.warn((
"MRG_RandomStreams Can't determine #streams from "
"size (%s), guessing 30*256") % str(size),
"size (%s), guessing 60*256") % str(size),
stacklevel=3)
return 30 * 256
return 60 * 256
class MRG_RandomStreams(object):
......
......@@ -1085,9 +1085,9 @@ class Maximum(BinaryScalarOp):
# max is not defined for complex_types
gx, gy = None, None
if x.type in float_types:
gx = eq(maximum(x, y), x) * gz
gx = cast(eq(maximum(x, y), x) * gz, x.type.dtype)
if y.type in float_types:
gy = eq(maximum(x, y), y) * gz
gy = cast(eq(maximum(x, y), y) * gz, y.type.dtype)
return (gx, gy)
maximum = Maximum(upcast_out, name='maximum')
......@@ -1110,9 +1110,9 @@ class Minimum(BinaryScalarOp):
# max is not defined for complex_types
gx, gy = None, None
if x.type in float_types:
gx = eq(minimum(x, y), x) * gz
gx = cast(eq(minimum(x, y), x) * gz, x.type.dtype)
if y.type in float_types:
gy = eq(minimum(x, y), y) * gz
gy = cast(eq(minimum(x, y), y) * gz, y.type.dtype)
return (gx, gy)
minimum = Minimum(upcast_out, name='minimum')
......
......@@ -375,7 +375,7 @@ def default_blas_ldflags():
# options part.
['-L%s' % l for l in blas_info['library_dirs']] +
['-l%s' % l for l in blas_info['libraries']] +
extra)
[])
# ['-I%s' % l for l in blas_info['include_dirs']])
except KeyError:
return "-lblas"
......
......@@ -1594,8 +1594,9 @@ class Sum(CAReduceDtype):
else:
new_dims.append(i)
i += 1
return Elemwise(scalar.second)(
x, DimShuffle(gz.type.broadcastable, new_dims)(gz)),
ds_op = DimShuffle(gz.type.broadcastable, new_dims)
gx = Elemwise(scalar.second)(x, ds_op(gz).astype(x.dtype))
return [gx]
def R_op(self, inputs, eval_points):
# There is just one element in inputs and eval_points, the axis are
......
......@@ -3136,6 +3136,39 @@ def local_cut_useless_reduce(node):
return [summed]
@register_canonicalize
@gof.local_optimizer([])
def local_sum_broadcastable(node):
    """Drop reduction over broadcastable (length-1) dimensions.

    Reducing over a broadcastable axis cannot change the values, so such
    axes can simply be removed from the input with a dimshuffle instead
    of being reduced.  This simplifies the generated reduction code,
    especially on GPU.
    """
    if not isinstance(node.op, T.CAReduce):
        return
    reduced, = node.inputs
    odtype = node.outputs[0].dtype
    if node.op.axis is None:
        # Full reduction: only removable when *every* dimension is
        # broadcastable, in which case the output is just the input
        # collapsed to a scalar.
        if all(reduced.broadcastable):
            return [reduced.dimshuffle().astype(odtype)]
        return
    axis = list(node.op.axis)
    cuttable = [a for a in axis if reduced.broadcastable[a]]
    if not cuttable:
        return
    # Keep every non-cuttable dimension, then remap the surviving
    # reduction axes to their positions once the broadcastable
    # dimensions have been dropped.
    pattern = [p for p in range(reduced.ndim) if p not in cuttable]
    new_axis = [i for i, p in enumerate(pattern) if p in axis]
    new_reduced = reduced.dimshuffle(*pattern)
    if new_axis:
        # Some genuine reduction work remains on the slimmed-down input.
        # NOTE(review): rebuilding the op from its class keeps only the
        # `axis` argument — presumed sufficient here; confirm for
        # CAReduce subclasses carrying extra state (e.g. dtype).
        return [node.op.__class__(axis=new_axis)(new_reduced)]
    # Every reduced axis was broadcastable: the reduction disappears.
    return [new_reduced.astype(odtype)]
@register_specialize
@gof.local_optimizer([])
def local_sum_alloc(node):
......
......@@ -46,6 +46,7 @@ from theano.tensor import (
)
from theano.tensor.elemwise import DimShuffle
from theano.tests import unittest_tools as utt
from theano.compile.mode import optdb
mode_opt = theano.config.mode
if mode_opt == 'FAST_COMPILE':
......@@ -3288,6 +3289,51 @@ class T_local_sum(unittest.TestCase):
finally:
config.on_opt_error = backup
def test_local_sum_broadcast_all_0(self):
    """A full sum() over an all-broadcastable tensor loses its CAReduce."""
    opt = optdb.query(self.mode._optimizer)
    x = T.TensorType('int64', (True, True, True))()
    fgraph = Env([x], [x.sum()])
    opt.optimize(fgraph)
    assert all(not isinstance(n.op, T.CAReduce)
               for n in fgraph.toposort())
def test_local_sum_broadcast_all_1(self):
    """sum(axis=[0, 1]) over a (1, 1) input loses its CAReduce."""
    opt = optdb.query(self.mode._optimizer)
    x = T.TensorType('int64', (True, True))()
    fgraph = Env([x], [x.sum(axis=[0, 1])])
    opt.optimize(fgraph)
    assert all(not isinstance(n.op, T.CAReduce)
               for n in fgraph.toposort())
def test_local_sum_broadcast_some_0(self):
    """Broadcastable axes are dropped; one smaller CAReduce remains."""
    opt = optdb.query(self.mode._optimizer)
    x = T.TensorType('int64', (True, False, True))()
    fgraph = Env([x], [x.sum(axis=[0, 1])])
    opt.optimize(fgraph)
    nodes = fgraph.toposort()
    assert sum(isinstance(n.op, T.CAReduce) for n in nodes) == 1
    reduce_node = nodes[-2]
    assert isinstance(reduce_node.op, T.CAReduce)
    # The leading broadcastable dimension was removed by the
    # local_sum_broadcastable optimization, so summation now runs over
    # what was originally x's dimension 1 (axis 0 of the 2-d input).
    assert reduce_node.inputs[0].ndim == 2, reduce_node
    assert reduce_node.op.axis == (0,), reduce_node.op.axis
def test_local_sum_broadcast_some_1(self):
    """When every reduced axis is broadcastable, the reduction vanishes."""
    opt = optdb.query(self.mode._optimizer)
    x = T.TensorType('int64', (True, False, True))()
    fgraph = Env([x], [x.sum(axis=[0, 2])])
    opt.optimize(fgraph)
    assert sum(isinstance(n.op, T.CAReduce)
               for n in fgraph.toposort()) == 0
class T_local_sum_dimshuffle(unittest.TestCase):
def setUp(self):
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎操作。
请先完成此评论的编辑!
注册 或者 登录 后发表评论