提交 db2bd471 authored 作者: lamblin's avatar lamblin

Merge pull request #754 from jaberg/ipam2012

Misc for IPAM GSS 2012
...@@ -605,7 +605,7 @@ def guess_n_streams(size, warn=True): ...@@ -605,7 +605,7 @@ def guess_n_streams(size, warn=True):
Return a guess at a good number of streams. Return a guess at a good number of streams.
:param warn: If True, warn when a guess cannot be made (in which case :param warn: If True, warn when a guess cannot be made (in which case
we return 30 * 256). we return 60 * 256).
""" """
# TODO: a smart way of choosing the number of streams, see #612. # TODO: a smart way of choosing the number of streams, see #612.
# Note that this code was moved out of `MRG_RandomStreams` so that it can # Note that this code was moved out of `MRG_RandomStreams` so that it can
...@@ -618,14 +618,22 @@ def guess_n_streams(size, warn=True): ...@@ -618,14 +618,22 @@ def guess_n_streams(size, warn=True):
r *= s r *= s
if r > 6: if r > 6:
r = r/6 # chosen as fastest for rbm_benchmark r = r/6 # chosen as fastest for rbm_benchmark
return r
# The purpose of sampling from many streams is to be able to use
# the GPU to its full capacity. It just wastes RAM and stream-initialization time to
# allocate more streams than necessary for the GPU.
# XXX: This number is chosen to be good for 280 and 480 architectures,
# Better would be to use pycuda to query the number of
# processors on the GPU device,
# rather than guessing 60.
return min(r, 60 * 256)
else: else:
if warn: if warn:
warnings.warn(( warnings.warn((
"MRG_RandomStreams Can't determine #streams from " "MRG_RandomStreams Can't determine #streams from "
"size (%s), guessing 30*256") % str(size), "size (%s), guessing 60*256") % str(size),
stacklevel=3) stacklevel=3)
return 30 * 256 return 60 * 256
class MRG_RandomStreams(object): class MRG_RandomStreams(object):
......
...@@ -1085,9 +1085,9 @@ class Maximum(BinaryScalarOp): ...@@ -1085,9 +1085,9 @@ class Maximum(BinaryScalarOp):
# max is not defined for complex_types # max is not defined for complex_types
gx, gy = None, None gx, gy = None, None
if x.type in float_types: if x.type in float_types:
gx = eq(maximum(x, y), x) * gz gx = cast(eq(maximum(x, y), x) * gz, x.type.dtype)
if y.type in float_types: if y.type in float_types:
gy = eq(maximum(x, y), y) * gz gy = cast(eq(maximum(x, y), y) * gz, y.type.dtype)
return (gx, gy) return (gx, gy)
maximum = Maximum(upcast_out, name='maximum') maximum = Maximum(upcast_out, name='maximum')
...@@ -1110,9 +1110,9 @@ class Minimum(BinaryScalarOp): ...@@ -1110,9 +1110,9 @@ class Minimum(BinaryScalarOp):
# max is not defined for complex_types # max is not defined for complex_types
gx, gy = None, None gx, gy = None, None
if x.type in float_types: if x.type in float_types:
gx = eq(minimum(x, y), x) * gz gx = cast(eq(minimum(x, y), x) * gz, x.type.dtype)
if y.type in float_types: if y.type in float_types:
gy = eq(minimum(x, y), y) * gz gy = cast(eq(minimum(x, y), y) * gz, y.type.dtype)
return (gx, gy) return (gx, gy)
minimum = Minimum(upcast_out, name='minimum') minimum = Minimum(upcast_out, name='minimum')
......
...@@ -375,7 +375,7 @@ def default_blas_ldflags(): ...@@ -375,7 +375,7 @@ def default_blas_ldflags():
# options part. # options part.
['-L%s' % l for l in blas_info['library_dirs']] + ['-L%s' % l for l in blas_info['library_dirs']] +
['-l%s' % l for l in blas_info['libraries']] + ['-l%s' % l for l in blas_info['libraries']] +
extra) [])
# ['-I%s' % l for l in blas_info['include_dirs']]) # ['-I%s' % l for l in blas_info['include_dirs']])
except KeyError: except KeyError:
return "-lblas" return "-lblas"
......
...@@ -1594,8 +1594,9 @@ class Sum(CAReduceDtype): ...@@ -1594,8 +1594,9 @@ class Sum(CAReduceDtype):
else: else:
new_dims.append(i) new_dims.append(i)
i += 1 i += 1
return Elemwise(scalar.second)( ds_op = DimShuffle(gz.type.broadcastable, new_dims)
x, DimShuffle(gz.type.broadcastable, new_dims)(gz)), gx = Elemwise(scalar.second)(x, ds_op(gz).astype(x.dtype))
return [gx]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
# There is just one element in inputs and eval_points, the axis are # There is just one element in inputs and eval_points, the axis are
......
...@@ -3136,6 +3136,39 @@ def local_cut_useless_reduce(node): ...@@ -3136,6 +3136,39 @@ def local_cut_useless_reduce(node):
return [summed] return [summed]
@register_canonicalize
@gof.local_optimizer([])
def local_sum_broadcastable(node):
    """Remove reduction over broadcastable dimensions.

    Reducing over a broadcastable dimension is replaced by a `dimshuffle`
    that drops that dimension. If every reduced axis is broadcastable the
    reduction disappears entirely and the input is only cast to the
    original output dtype.

    :param node: the Apply node to inspect; only nodes whose op is a
        `T.CAReduce` instance are rewritten.

    :returns: a one-element list holding the replacement variable, or
        None when the node is left untouched.
    """
    if not isinstance(node.op, T.CAReduce):
        return None

    reduced, = node.inputs
    odtype = node.outputs[0].dtype

    if node.op.axis is None:
        # Reduction over every axis: removable only if all of them are
        # broadcastable.
        if all(reduced.broadcastable):
            return [reduced.dimshuffle().astype(odtype)]
        return None

    axis = list(node.op.axis)
    cuttable = [a for a in axis if reduced.broadcastable[a]]
    if not cuttable:
        return None

    # -- we can remove some axes of summation,
    #    which simplifies the codegen for sum, especially on GPU
    new_axis = []  # reduced axes, renumbered after dropping `cuttable`
    pattern = []   # original dimensions that are kept, in order
    for p in range(reduced.ndim):
        if p in cuttable:
            continue
        if p in axis:
            # The kept dimension `p` lands at position len(pattern).
            new_axis.append(len(pattern))
        pattern.append(p)
    new_reduced = reduced.dimshuffle(*pattern)

    if not new_axis:
        # -- in this case we can remove the reduction completely
        return [new_reduced.astype(odtype)]

    if type(node.op) is T.CAReduce:
        # A plain CAReduce takes its scalar op as first constructor
        # argument, so it cannot be rebuilt from `axis` alone.
        new_op = node.op.__class__(node.op.scalar_op, axis=new_axis)
    else:
        new_op = node.op.__class__(axis=new_axis)
    new_out = new_op(new_reduced)

    # Rebuilding the op with only `axis` does not carry over any other
    # constructor argument of the original op. If that changes the output
    # dtype, skip the rewrite rather than return a replacement whose type
    # differs from the original output.
    if new_out.dtype != odtype:
        return None
    return [new_out]
@register_specialize @register_specialize
@gof.local_optimizer([]) @gof.local_optimizer([])
def local_sum_alloc(node): def local_sum_alloc(node):
......
...@@ -46,6 +46,7 @@ from theano.tensor import ( ...@@ -46,6 +46,7 @@ from theano.tensor import (
) )
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.compile.mode import optdb
mode_opt = theano.config.mode mode_opt = theano.config.mode
if mode_opt == 'FAST_COMPILE': if mode_opt == 'FAST_COMPILE':
...@@ -3288,6 +3289,51 @@ class T_local_sum(unittest.TestCase): ...@@ -3288,6 +3289,51 @@ class T_local_sum(unittest.TestCase):
finally: finally:
config.on_opt_error = backup config.on_opt_error = backup
def test_local_sum_broadcast_all_0(self):
    """A full sum (axis=None) over an all-broadcastable input leaves no
    CAReduce node in the optimized graph."""
    x = T.TensorType('int64', (True, True, True))()
    env = Env([x], [x.sum()])
    optdb.query(self.mode._optimizer).optimize(env)
    # No reduction op may survive the optimization.
    for apply_node in env.toposort():
        assert not isinstance(apply_node.op, T.CAReduce)
def test_local_sum_broadcast_all_1(self):
    """A sum over explicitly listed axes, all of them broadcastable,
    leaves no CAReduce node in the optimized graph."""
    x = T.TensorType('int64', (True, True))()
    env = Env([x], [x.sum(axis=[0, 1])])
    optdb.query(self.mode._optimizer).optimize(env)
    # No reduction op may survive the optimization.
    for apply_node in env.toposort():
        assert not isinstance(apply_node.op, T.CAReduce)
def test_local_sum_broadcast_some_0(self):
    """Summing over one broadcastable and one non-broadcastable axis
    keeps a single CAReduce, applied to a lower-rank input."""
    x = T.TensorType('int64', (True, False, True))()
    env = Env([x], [x.sum(axis=[0, 1])])
    optdb.query(self.mode._optimizer).optimize(env)

    order = env.toposort()
    reductions = [n for n in order if isinstance(n.op, T.CAReduce)]
    assert 1 == len(reductions)

    # The reduction is expected to be the second-to-last node.
    penultimate = order[-2]
    op = penultimate.op
    assert isinstance(op, T.CAReduce)
    # -- the leading broadcastable dimension has been dropped
    #    by the local_sum_broadcastable optimization
    #    now summation is over the original x's dimension 1.
    assert penultimate.inputs[0].ndim == 2, penultimate
    assert op.axis == (0,), op.axis
def test_local_sum_broadcast_some_1(self):
    """Summing only over the broadcastable axes of a partly
    broadcastable input leaves no CAReduce node at all."""
    x = T.TensorType('int64', (True, False, True))()
    env = Env([x], [x.sum(axis=[0, 2])])
    optdb.query(self.mode._optimizer).optimize(env)

    order = env.toposort()
    reductions = [n for n in order if isinstance(n.op, T.CAReduce)]
    assert 0 == len(reductions)
class T_local_sum_dimshuffle(unittest.TestCase): class T_local_sum_dimshuffle(unittest.TestCase):
def setUp(self): def setUp(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论