提交 b7f4733f authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2444 from daemonmaker/local_alloc_elemwise2

Local alloc elemwise2
......@@ -256,9 +256,23 @@ class GpuElemwise(GpuOp):
_inputs = [as_cuda_ndarray_variable(i) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
target_length = max([input.type.ndim for input in _inputs])
args = []
for input in _inputs:
length = input.type.ndim
difference = target_length - length
if not difference:
args.append(input)
else:
# TODO: use LComplete instead
args.append(GpuDimShuffle(
input.type.broadcastable,
['x'] * difference + range(length)
)(input))
_inputs = args
# output is broadcastable only along dimensions where all
# inputs are broadcastable
......@@ -303,7 +317,7 @@ class GpuDimShuffle(GpuOp):
def __init__(self, input_broadcastable, new_order):
input_broadcastable = tuple(input_broadcastable)
self.input_broadcastable = input_broadcastable
self.new_order = new_order
self.new_order = tuple(new_order)
for i, b in enumerate(input_broadcastable):
if i not in new_order:
......@@ -313,6 +327,13 @@ class GpuDimShuffle(GpuOp):
" dimension.",
(input_broadcastable, new_order))
# this is the list of the original dimensions that we keep
self.shuffle = [x for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not
# in the original input
self.augment = [i for i, x in enumerate(new_order) if x == 'x']
self.view_map = {0: [0]}
self._rehash()
......@@ -344,8 +365,7 @@ class GpuDimShuffle(GpuOp):
# Both cases are good.
ob = []
if not isinstance(input.type, CudaNdarrayType):
raise TypeError("The input of a GpuDimshuffle must"
" be a CudaNdarray")
input = as_cuda_ndarray_variable(input)
for value in self.new_order:
if value == 'x':
ob.append(True)
......@@ -486,6 +506,17 @@ class GpuDimShuffle(GpuOp):
def c_code_cache_version(self):
    """Return the version tuple of this op's C code, currently ``(1, 0)``."""
    return (1, 0)
def infer_shape(self, node, shapes):
    """Compute the output shape from the shape of the single input.

    ``shapes`` must contain exactly one entry, the input's shape. The
    dimensions listed in ``self.shuffle`` are picked from it in that
    order, then a dimension of length 1 is inserted at every output
    position listed in ``self.augment``.

    ``node`` is not used.

    Returns a one-element list holding the output shape as a list.
    """
    ishp, = shapes

    # transpose: keep the surviving input dimensions, in output order
    rval = [ishp[i] for i in self.shuffle]

    # augment: each entry is an index into the *output*, so the entries
    # are inserted in the order stored on the op.
    for augm in self.augment:
        rval.insert(augm, 1)
    return [rval]
class GpuCAReduce(GpuOp):
"""GpuCAReduce is a Reduction along some dimensions by a scalar op.
......@@ -3228,9 +3259,7 @@ class GpuAlloc(GpuOp):
v = as_cuda_ndarray_variable(value)
sh = [tensor.as_tensor_variable(s) for s in shape]
if v.ndim != len(shape):
raise TypeError(
'GpuAlloc requires value of same dimensions as shape',
value, len(shape))
value = tensor.shape_padleft(value, len(shape) - v.ndim)
bcast = []
for s in sh:
......
......@@ -1814,6 +1814,14 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
'fast_run', 'inplace', 'gpu_inplace')
register_opt()(tensor.opt.local_remove_useless_assert)
register_opt()(tensor.opt.local_shape_to_shape_i)
# Build a local optimizer that tracks GpuElemwise nodes. Its body comes from
# the factory tensor.opt.local_elemwise_alloc_op, which is handed the GPU
# elemwise, alloc and dimshuffle ops.
gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle)
)
# Register the optimizer twice: through this module's register_opt() helper
# and through tensor.opt.register_specialize_device.
# NOTE(review): presumably the second registration places it in the
# device-specific specialization phase -- confirm in tensor.opt.
register_opt()(gpu_elemwise_alloc)
tensor.opt.register_specialize_device(gpu_elemwise_alloc)
@register_opt()
@local_optimizer([tensor.alloc])
......@@ -1841,8 +1849,8 @@ def local_gpualloc(node):
val = node.inputs[0]
shp = node.inputs[1:]
old_out = node.outputs[0]
val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
new_out = host_from_gpu(gpu_alloc(val2, *shp))
new_out = host_from_gpu(gpu_alloc(val, *shp))
# Sigh. it's an annoying thing about theano
# that you can't add information to the graph.
# If for some reason it has come to light that
......
import operator
import sys
import unittest
import numpy
# Skip test if cuda_ndarray is not available.
......@@ -9,6 +10,7 @@ import theano
from theano.compile.pfunc import pfunc
from theano import config, tensor
import theano.tensor.tests.test_nlinalg
import theano.tensor.tests.test_opt as test_opt
from theano.tests import unittest_tools as utt
......@@ -86,6 +88,50 @@ def test_gpualloc():
assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
    """GPU variant of ``test_opt.Test_local_elemwise_alloc``.

    The inherited test methods are reused as they are. ``setUp`` replaces
    the graphs prepared by the parent class with graphs built with
    ``basic_ops.gpu_alloc`` and sets ``fast_run_mode`` to
    ``mode_with_gpu``. ``_verify_alloc_count`` is overridden so that
    ``basic_ops.GpuAlloc`` nodes are the ones being counted.
    """
    dtype = 'float32'

    def setUp(self):
        """Create the parent fixtures, then swap in their GPU versions."""
        # The parent setUp is relied upon to create self.vec, self.mat and
        # self.tens (presumably typed with self.dtype -- confirm in
        # test_opt).
        super(Test_local_elemwise_alloc, self).setUp()
        self.fast_run_mode = mode_with_gpu

        self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2, 2)
        self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.mat.shape)
        self.alloc_w_dep_tens = basic_ops.gpu_alloc(
            self.vec,
            self.tens.shape[0],
            self.tens.shape[1]
        )

        self.tv_wo_dep = basic_ops.gpu_alloc(self.vec, 5, 5)
        self.tm_wo_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)

        self.s = tensor.iscalar('s')
        self.tv_w_dep = basic_ops.gpu_alloc(self.vec, self.s, self.s)
        self.tm_w_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)

        self.row = tensor.row(dtype=self.dtype)
        self.o = basic_ops.gpu_alloc(self.row, 5, 5)

    def _verify_alloc_count(self, f, count):
        """Assert that the graph of ``f`` holds ``count`` GpuAlloc nodes."""
        found = sum(isinstance(elem.op, basic_ops.GpuAlloc)
                    for elem in f.maker.fgraph.toposort()
                    if elem.op is not None)
        assert found == count, (found, count)

    def _verify_assert_count(self, f, count):
        """Assert that the graph of ``f`` holds ``count`` Assert nodes."""
        found = sum(isinstance(elem.op, tensor.opt.Assert)
                    for elem in f.maker.fgraph.toposort()
                    if elem.op is not None)
        assert found == count, (found, count)
def test_alloc_memset_0():
i = tensor.iscalar()
z = numpy.zeros((1,), dtype='float32')
......
......@@ -2767,12 +2767,27 @@ class Test_local_elemwise_alloc(unittest.TestCase):
dtype = config.floatX
def setUp(self):
    """Build the symbolic inputs and ``alloc`` graphs shared by the tests.

    Every fixture is stored on ``self`` and typed with ``self.dtype``.
    The two compilation modes are attributes as well, so the test
    methods read them from ``self`` instead of hard-coding mode names.
    """
    self.fast_compile_mode = 'FAST_COMPILE'
    self.fast_run_mode = 'FAST_RUN'

    # Symbolic inputs.
    self.vec = T.vector('vec', dtype=self.dtype)
    self.mat = T.matrix('mat', dtype=self.dtype)
    self.tens = T.tensor3('tens', dtype=self.dtype)

    # Allocs of the vector: constant shape ("wo_dep") versus a shape
    # taken from another input ("w_dep").
    self.alloc_wo_dep = T.alloc(self.vec, 2, 2)
    self.alloc_w_dep = T.alloc(self.vec, *self.mat.shape)
    self.alloc_w_dep_tens = T.alloc(
        self.vec,
        self.tens.shape[0],
        self.tens.shape[1]
    )

    # Fixtures for the test adding two allocs together.
    self.tv_wo_dep = T.alloc(self.vec, 5, 5)
    self.tm_wo_dep = T.alloc(self.mat, 5, 5, 5)
    self.s = T.iscalar('s')
    self.tv_w_dep = T.alloc(self.vec, self.s, self.s)
    self.tm_w_dep = T.alloc(self.mat, 5, 5, 5)

    # Fixtures for test_error.
    self.row = theano.tensor.row(dtype=self.dtype)
    self.o = T.alloc(self.row, 5, 5)
def _verify_alloc_count(self, f, count):
assert(
......@@ -2793,7 +2808,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function(
[self.vec, self.mat],
self.alloc_wo_dep + self.mat,
mode='FAST_COMPILE'
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
......@@ -2802,7 +2817,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function(
[self.vec, self.mat],
self.alloc_wo_dep + self.mat,
mode='FAST_RUN'
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)
......@@ -2811,7 +2826,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function(
[self.vec, self.mat],
self.alloc_w_dep + self.mat,
mode='FAST_COMPILE'
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
......@@ -2820,7 +2835,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function(
[self.vec, self.mat],
self.alloc_w_dep + self. mat,
mode='FAST_RUN'
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 0)
......@@ -2829,8 +2844,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# No optimization on dimshuffle with assert
func = function(
[self.vec, self.tens],
T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_COMPILE'
self.alloc_wo_dep.dimshuffle(0, 1, 'x') + self.tens,
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
......@@ -2838,8 +2853,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# Optimization on dimshuffle with assert
func = function(
[self.vec, self.tens],
T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_RUN'
self.alloc_wo_dep.dimshuffle(0, 1, 'x') + self.tens,
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)
......@@ -2847,12 +2862,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# No optimization on dimshuffle without assert
func = function(
[self.vec, self.tens],
T.alloc(
self.vec,
self.tens.shape[0],
self.tens.shape[1]
).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_COMPILE'
self.alloc_w_dep_tens.dimshuffle(0, 1, 'x') + self.tens,
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
......@@ -2860,52 +2871,45 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# Optimization on dimshuffle without assert
func = function(
[self.vec, self.tens],
T.alloc(
self.vec,
self.tens.shape[0],
self.tens.shape[1]
).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_RUN'
self.alloc_w_dep_tens.dimshuffle(0, 1, 'x') + self.tens,
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 0)
def test_multi_input_single_alloc(self):
    """Check alloc/assert counts when both operands of an add are allocs.

    Uses the ``tv_*`` / ``tm_*`` fixtures and the compilation modes
    prepared by ``setUp``.
    """
    # Constant shapes, fast-compile mode: both allocs stay, no assert.
    func = function(
        [self.vec, self.mat],
        self.tv_wo_dep + self.tm_wo_dep,
        mode=self.fast_compile_mode
    )
    self._verify_alloc_count(func, 2)
    self._verify_assert_count(func, 0)

    # Constant shapes, fast-run mode: one alloc is left, no assert.
    func = function(
        [self.vec, self.mat],
        self.tv_wo_dep + self.tm_wo_dep,
        mode=self.fast_run_mode
    )
    self._verify_alloc_count(func, 1)
    self._verify_assert_count(func, 0)

    # Symbolic shape (self.s), fast-compile mode: both allocs stay,
    # no assert.
    func = function(
        [self.vec, self.mat, self.s],
        self.tv_w_dep + self.tm_w_dep,
        mode=self.fast_compile_mode
    )
    self._verify_alloc_count(func, 2)
    self._verify_assert_count(func, 0)

    # Symbolic shape (self.s), fast-run mode: one alloc is left and one
    # assert is expected in the graph.
    func = function(
        [self.vec, self.mat, self.s],
        self.tv_w_dep + self.tm_w_dep,
        mode=self.fast_run_mode
    )
    self._verify_alloc_count(func, 1)
    self._verify_assert_count(func, 1)
......@@ -2913,12 +2917,11 @@ class Test_local_elemwise_alloc(unittest.TestCase):
def test_error(self):
t3fft = theano.tensor.tensor(dtype=self.dtype,
broadcastable=(False, False, True))
row = theano.tensor.row(dtype=self.dtype)
o = T.alloc(row, 5, 5).dimshuffle(0, 1, 'x') + t3fft
o = self.o.dimshuffle(0, 1, 'x') + t3fft
func = function(
[t3fft, row],
[t3fft, self.row],
o,
mode='FAST_RUN'
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论