Merge pull request #2444 from daemonmaker/local_alloc_elemwise2

Local alloc elemwise2

Merge pull request #2444 from daemonmaker/local_alloc_elemwise2
b7f4733f · Frédéric Bastien · c40fff12 · be5368e3 · b7f4733f · b7f4733f
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -256,9 +256,23 @@ class GpuElemwise(GpuOp):
        _inputs = [as_cuda_ndarray_variable(i) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
-        for i in _inputs[1:]:
-            if i.type.ndim != inputs[0].type.ndim:
-                raise TypeError('different ranks among inputs')
+
+        target_length = max([input.type.ndim for input in _inputs])
+
+        args = []
+        for input in _inputs:
+            length = input.type.ndim
+            difference = target_length - length
+            if not difference:
+                args.append(input)
+            else:
+                # TODO: use LComplete instead
+                args.append(GpuDimShuffle(
+                    input.type.broadcastable,
+                    ['x'] * difference + range(length)
+                    )(input))
+        _inputs = args
+

        # output is broadcastable only along dimensions where all
        # inputs are broadcastable
@@ -303,7 +317,7 @@ class GpuDimShuffle(GpuOp):
    def __init__(self, input_broadcastable, new_order):
        input_broadcastable = tuple(input_broadcastable)
        self.input_broadcastable = input_broadcastable
-        self.new_order = new_order
+        self.new_order = tuple(new_order)

        for i, b in enumerate(input_broadcastable):
            if i not in new_order:
@@ -313,6 +327,13 @@ class GpuDimShuffle(GpuOp):
                                     " dimension.",
                                     (input_broadcastable, new_order))

+        # this is the list of the original dimensions that we keep
+        self.shuffle = [x for x in new_order if x != 'x']
+
+        # list of dimensions of the output that are broadcastable and were not
+        # in the original input
+        self.augment = [i for i, x in enumerate(new_order) if x == 'x']
+
        self.view_map = {0: [0]}

        self._rehash()
@@ -344,8 +365,7 @@ class GpuDimShuffle(GpuOp):
                # Both case are good.
        ob = []
        if not isinstance(input.type, CudaNdarrayType):
-            raise TypeError("The input of a GpuDimshuffle must"
-                            " be a CudaNdarray")
+            input = as_cuda_ndarray_variable(input)
        for value in self.new_order:
            if value == 'x':
                ob.append(True)
@@ -486,6 +506,17 @@ class GpuDimShuffle(GpuOp):
    def c_code_cache_version(self):
        return (1, 0)

+    def infer_shape(self, node, shapes):
+        ishp, = shapes  # exactly one input shape is expected
+        # transpose: kept input dims, in the order given by new_order
+        rval = [ishp[i] for i in self.shuffle]
+
+        # augment: put a length-1 dim at each output position that was 'x'
+        for augm in self.augment:
+            rval.insert(augm, 1)
+        return [rval]
+
+

 class GpuCAReduce(GpuOp):
    """GpuCAReduce is a Reduction along some dimensions by a scalar op.
@@ -3228,9 +3259,7 @@ class GpuAlloc(GpuOp):
        v = as_cuda_ndarray_variable(value)
        sh = [tensor.as_tensor_variable(s) for s in shape]
        if v.ndim != len(shape):
-            raise TypeError(
-                'GpuAlloc requires value of same dimensions as shape',
-                value, len(shape))
+            value = tensor.shape_padleft(value, len(shape) - v.ndim)

        bcast = []
        for s in sh:

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1814,6 +1814,14 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
 optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
               'fast_run', 'inplace', 'gpu_inplace')

+register_opt()(tensor.opt.local_remove_useless_assert)
+
+register_opt()(tensor.opt.local_shape_to_shape_i)
+gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
+    tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle)
+)
+register_opt()(gpu_elemwise_alloc)
+tensor.opt.register_specialize_device(gpu_elemwise_alloc)

 @register_opt()
 @local_optimizer([tensor.alloc])
@@ -1841,8 +1849,8 @@ def local_gpualloc(node):
        val = node.inputs[0]
        shp = node.inputs[1:]
        old_out = node.outputs[0]
-        val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
-        new_out = host_from_gpu(gpu_alloc(val2, *shp))
+        new_out = host_from_gpu(gpu_alloc(val, *shp))
+
        # Sigh. it's an annoying thing about theano
        # that you can't add information to the graph.
        # If for some reason it has come to light that

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
 import operator
 import sys
+import unittest

 import numpy
 # Skip test if cuda_ndarray is not available.
@@ -9,6 +10,7 @@ import theano
 from theano.compile.pfunc import pfunc
 from theano import config, tensor
 import theano.tensor.tests.test_nlinalg
+import theano.tensor.tests.test_opt as test_opt

 from theano.tests import unittest_tools as utt

@@ -86,6 +88,50 @@ def test_gpualloc():
    assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])


+class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
+    dtype = 'float32'
+
+    def setUp(self):
+        super(Test_local_elemwise_alloc, self).setUp()
+        self.fast_run_mode = mode_with_gpu  # fast_compile_mode: parent's value
+
+        # The parent setUp has already created self.vec, self.mat and
+        # self.tens with self.dtype ('float32' here); they are reused as-is
+        # as the values handed to the allocs below.
+
+        # Rebuild the remaining parent fixtures under the same names, with
+        # basic_ops.gpu_alloc in place of T.alloc.
+
+        self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2, 2)
+        self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.mat.shape)
+        self.alloc_w_dep_tens = basic_ops.gpu_alloc(
+            self.vec,
+            self.tens.shape[0],
+            self.tens.shape[1]
+        )
+        self.tv_wo_dep = basic_ops.gpu_alloc(self.vec, 5, 5)
+        self.tm_wo_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)
+        self.s = tensor.iscalar('s')
+        self.tv_w_dep = basic_ops.gpu_alloc(self.vec, self.s, self.s)
+        self.tm_w_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)
+        self.row = tensor.row(dtype=self.dtype)
+        self.o = basic_ops.gpu_alloc(self.row, 5, 5)
+
+    def _verify_alloc_count(self, f, count):
+        assert(
+            sum([isinstance(elem.op, basic_ops.GpuAlloc)
+                 for elem in f.maker.fgraph.toposort()
+                 if elem.op is not None]) == count
+        )
+
+    def _verify_assert_count(self, f, count):
+        assert(
+            sum([isinstance(elem.op, tensor.opt.Assert)
+                 for elem in f.maker.fgraph.toposort()
+                 if elem.op is not None]) == count
+        )
+
+
 def test_alloc_memset_0():
    i = tensor.iscalar()
    z = numpy.zeros((1,), dtype='float32')

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -2767,12 +2767,27 @@ class Test_local_elemwise_alloc(unittest.TestCase):
    dtype = config.floatX

    def setUp(self):
-        self.vec = T.vector('vec', dtype=theano.config.floatX)
-        self.mat = T.matrix('mat', dtype=theano.config.floatX)
-        self.tens = T.tensor3('tens', dtype=theano.config.floatX)
+        self.fast_compile_mode = 'FAST_COMPILE'
+        self.fast_run_mode = 'FAST_RUN'
+        # Inputs take their dtype from the class attribute `dtype`.
+        self.vec = T.vector('vec', dtype=self.dtype)
+        self.mat = T.matrix('mat', dtype=self.dtype)
+        self.tens = T.tensor3('tens', dtype=self.dtype)

        self.alloc_wo_dep = T.alloc(self.vec, 2, 2)
        self.alloc_w_dep = T.alloc(self.vec, *self.mat.shape)
+        self.alloc_w_dep_tens = T.alloc(
+            self.vec,
+            self.tens.shape[0],
+            self.tens.shape[1]
+        )
+        self.tv_wo_dep = T.alloc(self.vec, 5, 5)
+        self.tm_wo_dep = T.alloc(self.mat, 5, 5, 5)
+        self.s = T.iscalar('s')
+        self.tv_w_dep = T.alloc(self.vec, self.s, self.s)
+        self.tm_w_dep = T.alloc(self.mat, 5, 5, 5)  # same args as tm_wo_dep
+        self.row = theano.tensor.row(dtype=self.dtype)
+        self.o = T.alloc(self.row, 5, 5)

    def _verify_alloc_count(self, f, count):
        assert(
@@ -2793,7 +2808,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        func = function(
            [self.vec, self.mat],
            self.alloc_wo_dep + self.mat,
-            mode='FAST_COMPILE'
+            mode=self.fast_compile_mode
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)
@@ -2802,7 +2817,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        func = function(
            [self.vec, self.mat],
            self.alloc_wo_dep + self.mat,
-            mode='FAST_RUN'
+            mode=self.fast_run_mode
        )
        self._verify_alloc_count(func, 0)
        self._verify_assert_count(func, 1)
@@ -2811,7 +2826,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        func = function(
            [self.vec, self.mat],
            self.alloc_w_dep + self.mat,
-            mode='FAST_COMPILE'
+            mode=self.fast_compile_mode
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)
@@ -2820,7 +2835,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        func = function(
            [self.vec, self.mat],
            self.alloc_w_dep + self. mat,
-            mode='FAST_RUN'
+            mode=self.fast_run_mode
        )
        self._verify_alloc_count(func, 0)
        self._verify_assert_count(func, 0)
@@ -2829,8 +2844,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        # No optimization on dimshuffle with assert
        func = function(
            [self.vec, self.tens],
-            T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x') + self.tens,
-            mode='FAST_COMPILE'
+            self.alloc_wo_dep.dimshuffle(0, 1, 'x') + self.tens,
+            mode=self.fast_compile_mode
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)
@@ -2838,8 +2853,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        # Optimization on dimshuffle with assert
        func = function(
            [self.vec, self.tens],
-            T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x') + self.tens,
-            mode='FAST_RUN'
+            self.alloc_wo_dep.dimshuffle(0, 1, 'x') + self.tens,
+            mode=self.fast_run_mode
        )
        self._verify_alloc_count(func, 0)
        self._verify_assert_count(func, 1)
@@ -2847,12 +2862,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        # No optimization on dimshuffle without assert
        func = function(
            [self.vec, self.tens],
-            T.alloc(
-                self.vec,
-                self.tens.shape[0],
-                self.tens.shape[1]
-            ).dimshuffle(0, 1, 'x') + self.tens,
-            mode='FAST_COMPILE'
+            self.alloc_w_dep_tens.dimshuffle(0, 1, 'x') + self.tens,
+            mode=self.fast_compile_mode
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)
@@ -2860,52 +2871,45 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        # Optimization on dimshuffle without assert
        func = function(
            [self.vec, self.tens],
-            T.alloc(
-                self.vec,
-                self.tens.shape[0],
-                self.tens.shape[1]
-            ).dimshuffle(0, 1, 'x') + self.tens,
-            mode='FAST_RUN'
+            self.alloc_w_dep_tens.dimshuffle(0, 1, 'x') + self.tens,
+            mode=self.fast_run_mode
        )
        self._verify_alloc_count(func, 0)
        self._verify_assert_count(func, 0)

    def test_multi_input_single_alloc(self):
-        tv = T.alloc(self.vec, 5, 5)
-        tm = T.alloc(self.mat, 5, 5, 5)
+        # fast_compile_mode, constant shapes: expect 2 allocs, 0 asserts
        func = function(
            [self.vec, self.mat],
-            tv + tm,
-            mode='FAST_COMPILE'
+            self.tv_wo_dep + self.tm_wo_dep,
+            mode=self.fast_compile_mode
        )
-
        self._verify_alloc_count(func, 2)
        self._verify_assert_count(func, 0)

+        # fast_run_mode, constant shapes: expect 1 alloc, 0 asserts
        func = function(
            [self.vec, self.mat],
-            tv + tm,
-            mode='FAST_RUN'
+            self.tv_wo_dep + self.tm_wo_dep,
+            mode=self.fast_run_mode
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)

-        s = T.iscalar('s')
-        tv = T.alloc(self.vec, s, s)
-        tm = T.alloc(self.mat, 5, 5, 5)
+        # fast_compile_mode, tv sized by scalar s: expect 2 allocs, 0 asserts
        func = function(
-            [self.vec, self.mat, s],
-            tv + tm,
-            mode='FAST_COMPILE'
+            [self.vec, self.mat, self.s],
+            self.tv_w_dep + self.tm_w_dep,
+            mode=self.fast_compile_mode
        )
-
        self._verify_alloc_count(func, 2)
        self._verify_assert_count(func, 0)

+        # fast_run_mode, tv sized by scalar s: expect 1 alloc, 1 assert
        func = function(
-            [self.vec, self.mat, s],
-            tv + tm,
-            mode='FAST_RUN'
+            [self.vec, self.mat, self.s],
+            self.tv_w_dep + self.tm_w_dep,
+            mode=self.fast_run_mode
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 1)
@@ -2913,12 +2917,11 @@ class Test_local_elemwise_alloc(unittest.TestCase):
    def test_error(self):
        t3fft = theano.tensor.tensor(dtype=self.dtype,
                                     broadcastable=(False, False, True))
-        row = theano.tensor.row(dtype=self.dtype)
-        o = T.alloc(row, 5, 5).dimshuffle(0, 1, 'x') + t3fft
+        o = self.o.dimshuffle(0, 1, 'x') + t3fft
        func = function(
-            [t3fft, row],
+            [t3fft, self.row],
            o,
-            mode='FAST_RUN'
+            mode=self.fast_run_mode
        )
        self._verify_alloc_count(func, 0)
        self._verify_assert_count(func, 1)