提交 b7f4733f，作者：Frédéric Bastien

Merge pull request #2444 from daemonmaker/local_alloc_elemwise2

Local alloc elemwise2
...@@ -256,9 +256,23 @@ class GpuElemwise(GpuOp): ...@@ -256,9 +256,23 @@ class GpuElemwise(GpuOp):
_inputs = [as_cuda_ndarray_variable(i) for i in inputs] _inputs = [as_cuda_ndarray_variable(i) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin: if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs))) raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.ndim != inputs[0].type.ndim: target_length = max([input.type.ndim for input in _inputs])
raise TypeError('different ranks among inputs')
args = []
for input in _inputs:
length = input.type.ndim
difference = target_length - length
if not difference:
args.append(input)
else:
# TODO: use LComplete instead
args.append(GpuDimShuffle(
input.type.broadcastable,
['x'] * difference + range(length)
)(input))
_inputs = args
# output is broadcastable only along dimensions where all # output is broadcastable only along dimensions where all
# inputs are broadcastable # inputs are broadcastable
...@@ -303,7 +317,7 @@ class GpuDimShuffle(GpuOp): ...@@ -303,7 +317,7 @@ class GpuDimShuffle(GpuOp):
def __init__(self, input_broadcastable, new_order): def __init__(self, input_broadcastable, new_order):
input_broadcastable = tuple(input_broadcastable) input_broadcastable = tuple(input_broadcastable)
self.input_broadcastable = input_broadcastable self.input_broadcastable = input_broadcastable
self.new_order = new_order self.new_order = tuple(new_order)
for i, b in enumerate(input_broadcastable): for i, b in enumerate(input_broadcastable):
if i not in new_order: if i not in new_order:
...@@ -313,6 +327,13 @@ class GpuDimShuffle(GpuOp): ...@@ -313,6 +327,13 @@ class GpuDimShuffle(GpuOp):
" dimension.", " dimension.",
(input_broadcastable, new_order)) (input_broadcastable, new_order))
# this is the list of the original dimensions that we keep
self.shuffle = [x for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not
# in the original input
self.augment = [i for i, x in enumerate(new_order) if x == 'x']
self.view_map = {0: [0]} self.view_map = {0: [0]}
self._rehash() self._rehash()
...@@ -344,8 +365,7 @@ class GpuDimShuffle(GpuOp): ...@@ -344,8 +365,7 @@ class GpuDimShuffle(GpuOp):
# Both case are good. # Both case are good.
ob = [] ob = []
if not isinstance(input.type, CudaNdarrayType): if not isinstance(input.type, CudaNdarrayType):
raise TypeError("The input of a GpuDimshuffle must" input = as_cuda_ndarray_variable(input)
" be a CudaNdarray")
for value in self.new_order: for value in self.new_order:
if value == 'x': if value == 'x':
ob.append(True) ob.append(True)
...@@ -486,6 +506,17 @@ class GpuDimShuffle(GpuOp): ...@@ -486,6 +506,17 @@ class GpuDimShuffle(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, 0) return (1, 0)
def infer_shape(self, node, shapes):
    """Compute the symbolic output shape from the single input shape.

    :param node: the apply node (not used by this computation).
    :param shapes: a one-element sequence holding the input's shape.
    :return: a one-element list containing the output shape as a list.
    """
    (in_shape,) = shapes
    # Keep (and reorder) the input dimensions named in self.shuffle.
    out_shape = [in_shape[dim] for dim in self.shuffle]
    # Each position in self.augment receives a length-1 entry; insertion
    # happens in the order the positions are stored.
    for position in self.augment:
        out_shape.insert(position, 1)
    return [out_shape]
class GpuCAReduce(GpuOp): class GpuCAReduce(GpuOp):
"""GpuCAReduce is a Reduction along some dimensions by a scalar op. """GpuCAReduce is a Reduction along some dimensions by a scalar op.
...@@ -3228,9 +3259,7 @@ class GpuAlloc(GpuOp): ...@@ -3228,9 +3259,7 @@ class GpuAlloc(GpuOp):
v = as_cuda_ndarray_variable(value) v = as_cuda_ndarray_variable(value)
sh = [tensor.as_tensor_variable(s) for s in shape] sh = [tensor.as_tensor_variable(s) for s in shape]
if v.ndim != len(shape): if v.ndim != len(shape):
raise TypeError( value = tensor.shape_padleft(value, len(shape) - v.ndim)
'GpuAlloc requires value of same dimensions as shape',
value, len(shape))
bcast = [] bcast = []
for s in sh: for s in sh:
......
...@@ -1814,6 +1814,14 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op( ...@@ -1814,6 +1814,14 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75, optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
'fast_run', 'inplace', 'gpu_inplace') 'fast_run', 'inplace', 'gpu_inplace')
# Pass two existing tensor optimizations through register_opt()
# (presumably this adds them to the GPU optimizer set -- confirm against
# the definition of register_opt, which is not visible here).
register_opt()(tensor.opt.local_remove_useless_assert)
register_opt()(tensor.opt.local_shape_to_shape_i)
# Instantiate the generic elemwise/alloc optimization with the GPU ops
# (GpuElemwise, GpuAlloc, GpuDimShuffle) and wrap it as a local optimizer
# tracking GpuElemwise.
gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle)
)
# The resulting optimizer is registered twice: once through register_opt()
# and once through tensor.opt.register_specialize_device.
register_opt()(gpu_elemwise_alloc)
tensor.opt.register_specialize_device(gpu_elemwise_alloc)
@register_opt() @register_opt()
@local_optimizer([tensor.alloc]) @local_optimizer([tensor.alloc])
...@@ -1841,8 +1849,8 @@ def local_gpualloc(node): ...@@ -1841,8 +1849,8 @@ def local_gpualloc(node):
val = node.inputs[0] val = node.inputs[0]
shp = node.inputs[1:] shp = node.inputs[1:]
old_out = node.outputs[0] old_out = node.outputs[0]
val2 = tensor.shape_padleft(val, len(shp) - val.ndim) new_out = host_from_gpu(gpu_alloc(val, *shp))
new_out = host_from_gpu(gpu_alloc(val2, *shp))
# Sigh. it's an annoying thing about theano # Sigh. it's an annoying thing about theano
# that you can't add information to the graph. # that you can't add information to the graph.
# If for some reason it has come to light that # If for some reason it has come to light that
......
import operator import operator
import sys import sys
import unittest
import numpy import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
...@@ -9,6 +10,7 @@ import theano ...@@ -9,6 +10,7 @@ import theano
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import config, tensor from theano import config, tensor
import theano.tensor.tests.test_nlinalg import theano.tensor.tests.test_nlinalg
import theano.tensor.tests.test_opt as test_opt
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -86,6 +88,50 @@ def test_gpualloc(): ...@@ -86,6 +88,50 @@ def test_gpualloc():
assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l]) assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
    """GPU flavour of the elemwise/alloc optimization tests.

    The test methods are inherited from the tensor test class; this
    subclass only swaps the fixtures for graphs built with
    ``basic_ops.gpu_alloc`` and compiles the fast-run cases with
    ``mode_with_gpu``.
    """

    dtype = 'float32'

    def setUp(self):
        """Build the parent's fixtures, then replace the alloc graphs."""
        super(Test_local_elemwise_alloc, self).setUp()
        self.fast_run_mode = mode_with_gpu

        # self.vec, self.mat and self.tens are not redefined here: the
        # values left by the parent setUp() are reused as alloc inputs.
        gpu_alloc = basic_ops.gpu_alloc

        self.alloc_wo_dep = gpu_alloc(self.vec, 2, 2)
        self.alloc_w_dep = gpu_alloc(self.vec, *self.mat.shape)
        self.alloc_w_dep_tens = gpu_alloc(
            self.vec,
            self.tens.shape[0],
            self.tens.shape[1]
        )

        self.tv_wo_dep = gpu_alloc(self.vec, 5, 5)
        self.tm_wo_dep = gpu_alloc(self.mat, 5, 5, 5)

        self.s = tensor.iscalar('s')
        self.tv_w_dep = gpu_alloc(self.vec, self.s, self.s)
        self.tm_w_dep = gpu_alloc(self.mat, 5, 5, 5)

        self.row = tensor.row(dtype=self.dtype)
        self.o = gpu_alloc(self.row, 5, 5)

    def _verify_alloc_count(self, f, count):
        """Assert that the graph of ``f`` has ``count`` GpuAlloc nodes."""
        found = sum(
            1 for node in f.maker.fgraph.toposort()
            if node.op is not None
            and isinstance(node.op, basic_ops.GpuAlloc)
        )
        assert found == count

    def _verify_assert_count(self, f, count):
        """Assert that the graph of ``f`` has ``count`` Assert nodes."""
        found = sum(
            1 for node in f.maker.fgraph.toposort()
            if node.op is not None
            and isinstance(node.op, tensor.opt.Assert)
        )
        assert found == count
def test_alloc_memset_0(): def test_alloc_memset_0():
i = tensor.iscalar() i = tensor.iscalar()
z = numpy.zeros((1,), dtype='float32') z = numpy.zeros((1,), dtype='float32')
......
...@@ -2767,12 +2767,27 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2767,12 +2767,27 @@ class Test_local_elemwise_alloc(unittest.TestCase):
dtype = config.floatX dtype = config.floatX
def setUp(self): def setUp(self):
self.vec = T.vector('vec', dtype=theano.config.floatX) self.fast_compile_mode = 'FAST_COMPILE'
self.mat = T.matrix('mat', dtype=theano.config.floatX) self.fast_run_mode = 'FAST_RUN'
self.tens = T.tensor3('tens', dtype=theano.config.floatX)
self.vec = T.vector('vec', dtype=self.dtype)
self.mat = T.matrix('mat', dtype=self.dtype)
self.tens = T.tensor3('tens', dtype=self.dtype)
self.alloc_wo_dep = T.alloc(self.vec, 2, 2) self.alloc_wo_dep = T.alloc(self.vec, 2, 2)
self.alloc_w_dep = T.alloc(self.vec, *self.mat.shape) self.alloc_w_dep = T.alloc(self.vec, *self.mat.shape)
self.alloc_w_dep_tens = T.alloc(
self.vec,
self.tens.shape[0],
self.tens.shape[1]
)
self.tv_wo_dep = T.alloc(self.vec, 5, 5)
self.tm_wo_dep = T.alloc(self.mat, 5, 5, 5)
self.s = T.iscalar('s')
self.tv_w_dep = T.alloc(self.vec, self.s, self.s)
self.tm_w_dep = T.alloc(self.mat, 5, 5, 5)
self.row = theano.tensor.row(dtype=self.dtype)
self.o = T.alloc(self.row, 5, 5)
def _verify_alloc_count(self, f, count): def _verify_alloc_count(self, f, count):
assert( assert(
...@@ -2793,7 +2808,7 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2793,7 +2808,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function( func = function(
[self.vec, self.mat], [self.vec, self.mat],
self.alloc_wo_dep + self.mat, self.alloc_wo_dep + self.mat,
mode='FAST_COMPILE' mode=self.fast_compile_mode
) )
self._verify_alloc_count(func, 1) self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
...@@ -2802,7 +2817,7 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2802,7 +2817,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function( func = function(
[self.vec, self.mat], [self.vec, self.mat],
self.alloc_wo_dep + self.mat, self.alloc_wo_dep + self.mat,
mode='FAST_RUN' mode=self.fast_run_mode
) )
self._verify_alloc_count(func, 0) self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1) self._verify_assert_count(func, 1)
...@@ -2811,7 +2826,7 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2811,7 +2826,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function( func = function(
[self.vec, self.mat], [self.vec, self.mat],
self.alloc_w_dep + self.mat, self.alloc_w_dep + self.mat,
mode='FAST_COMPILE' mode=self.fast_compile_mode
) )
self._verify_alloc_count(func, 1) self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
...@@ -2820,7 +2835,7 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2820,7 +2835,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function( func = function(
[self.vec, self.mat], [self.vec, self.mat],
self.alloc_w_dep + self. mat, self.alloc_w_dep + self. mat,
mode='FAST_RUN' mode=self.fast_run_mode
) )
self._verify_alloc_count(func, 0) self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
...@@ -2829,8 +2844,8 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2829,8 +2844,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# No optimization on dimshuffle with assert # No optimization on dimshuffle with assert
func = function( func = function(
[self.vec, self.tens], [self.vec, self.tens],
T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x') + self.tens, self.alloc_wo_dep.dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_COMPILE' mode=self.fast_compile_mode
) )
self._verify_alloc_count(func, 1) self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
...@@ -2838,8 +2853,8 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2838,8 +2853,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# Optimization on dimshuffle with assert # Optimization on dimshuffle with assert
func = function( func = function(
[self.vec, self.tens], [self.vec, self.tens],
T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x') + self.tens, self.alloc_wo_dep.dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_RUN' mode=self.fast_run_mode
) )
self._verify_alloc_count(func, 0) self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1) self._verify_assert_count(func, 1)
...@@ -2847,12 +2862,8 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2847,12 +2862,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# No optimization on dimshuffle without assert # No optimization on dimshuffle without assert
func = function( func = function(
[self.vec, self.tens], [self.vec, self.tens],
T.alloc( self.alloc_w_dep_tens.dimshuffle(0, 1, 'x') + self.tens,
self.vec, mode=self.fast_compile_mode
self.tens.shape[0],
self.tens.shape[1]
).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_COMPILE'
) )
self._verify_alloc_count(func, 1) self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
...@@ -2860,52 +2871,45 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2860,52 +2871,45 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# Optimization on dimshuffle without assert # Optimization on dimshuffle without assert
func = function( func = function(
[self.vec, self.tens], [self.vec, self.tens],
T.alloc( self.alloc_w_dep_tens.dimshuffle(0, 1, 'x') + self.tens,
self.vec, mode=self.fast_run_mode
self.tens.shape[0],
self.tens.shape[1]
).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_RUN'
) )
self._verify_alloc_count(func, 0) self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
def test_multi_input_single_alloc(self): def test_multi_input_single_alloc(self):
tv = T.alloc(self.vec, 5, 5) # No optimization on dimshuffle with assert
tm = T.alloc(self.mat, 5, 5, 5)
func = function( func = function(
[self.vec, self.mat], [self.vec, self.mat],
tv + tm, self.tv_wo_dep + self.tm_wo_dep,
mode='FAST_COMPILE' mode=self.fast_compile_mode
) )
self._verify_alloc_count(func, 2) self._verify_alloc_count(func, 2)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
# Optimization on dimshuffle with assert
func = function( func = function(
[self.vec, self.mat], [self.vec, self.mat],
tv + tm, self.tv_wo_dep + self.tm_wo_dep,
mode='FAST_RUN' mode=self.fast_run_mode
) )
self._verify_alloc_count(func, 1) self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
s = T.iscalar('s') # No optimization on dimshuffle without assert
tv = T.alloc(self.vec, s, s)
tm = T.alloc(self.mat, 5, 5, 5)
func = function( func = function(
[self.vec, self.mat, s], [self.vec, self.mat, self.s],
tv + tm, self.tv_w_dep + self.tm_w_dep,
mode='FAST_COMPILE' mode=self.fast_compile_mode
) )
self._verify_alloc_count(func, 2) self._verify_alloc_count(func, 2)
self._verify_assert_count(func, 0) self._verify_assert_count(func, 0)
# Optimization on dimshuffle without assert
func = function( func = function(
[self.vec, self.mat, s], [self.vec, self.mat, self.s],
tv + tm, self.tv_w_dep + self.tm_w_dep,
mode='FAST_RUN' mode=self.fast_run_mode
) )
self._verify_alloc_count(func, 1) self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 1) self._verify_assert_count(func, 1)
...@@ -2913,12 +2917,11 @@ class Test_local_elemwise_alloc(unittest.TestCase): ...@@ -2913,12 +2917,11 @@ class Test_local_elemwise_alloc(unittest.TestCase):
def test_error(self): def test_error(self):
t3fft = theano.tensor.tensor(dtype=self.dtype, t3fft = theano.tensor.tensor(dtype=self.dtype,
broadcastable=(False, False, True)) broadcastable=(False, False, True))
row = theano.tensor.row(dtype=self.dtype) o = self.o.dimshuffle(0, 1, 'x') + t3fft
o = T.alloc(row, 5, 5).dimshuffle(0, 1, 'x') + t3fft
func = function( func = function(
[t3fft, row], [t3fft, self.row],
o, o,
mode='FAST_RUN' mode=self.fast_run_mode
) )
self._verify_alloc_count(func, 0) self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1) self._verify_assert_count(func, 1)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论