Commit 6c23f17d authored by Dustin Webb

Generalized the test code to work for both the CPU and GPU implementations.

There is still one problem to work out in the tests, so this is not ready to merge.
Parent 5b6dd257
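The test generalization follows a simple pattern: the GPU test case inherits from the CPU test case (theano.tensor.tests.test_opt.Test_local_elemwise_alloc) and overrides only the fixtures and the compilation mode, so each test method is written once and runs on both backends. A minimal sketch of that pattern, with illustrative class names rather than the actual ones in the diff:

    import unittest
    import theano
    from theano import tensor

    class CpuCase(unittest.TestCase):
        def setUp(self):
            # Fixtures and mode are attributes so a subclass can swap them out.
            self.vec = tensor.vector('vec')
            self.alloc = tensor.alloc(self.vec, 2, 2)
            self.mode = theano.compile.mode.get_default_mode()

        def test_sum(self):
            f = theano.function([self.vec], self.alloc.sum(), mode=self.mode)
            assert f([1.0, 2.0]) == 6.0

    class GpuCase(CpuCase):
        # Inherits every test method; only the fixtures/mode change here.
        def setUp(self):
            super(GpuCase, self).setUp()
            self.mode = self.mode.including('gpu')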
@@ -256,9 +256,23 @@ class GpuElemwise(GpuOp):
_inputs = [as_cuda_ndarray_variable(i) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
target_length = max([input.type.ndim for input in _inputs])
args = []
for input in _inputs:
length = input.type.ndim
difference = target_length - length
if not difference:
args.append(input)
else:
# TODO: use LComplete instead
args.append(GpuDimShuffle(
input.type.broadcastable,
['x'] * difference + range(length)
)(input))
_inputs = args
# output is broadcastable only along dimensions where all
# inputs are broadcastable
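The new make_node logic above no longer requires all inputs to have the same rank; lower-rank inputs are left-padded with broadcastable dimensions (the 'x' entries in the dimshuffle pattern) until they match the largest ndim, which is the same alignment rule NumPy uses for broadcasting. A rough NumPy rendering of that padding step, where pad_to_ndim is an illustrative helper and not part of the diff:

    import numpy

    def pad_to_ndim(a, target_ndim):
        # Prepend length-1 (broadcastable) axes, like dimshuffle(('x',) * diff + axes).
        return a.reshape((1,) * (target_ndim - a.ndim) + a.shape)

    m = numpy.arange(6.).reshape(2, 3)   # ndim 2
    v = numpy.arange(3.)                 # ndim 1
    assert (pad_to_ndim(v, 2) + m).shape == (2, 3)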
@@ -303,7 +317,7 @@ class GpuDimShuffle(GpuOp):
def __init__(self, input_broadcastable, new_order):
input_broadcastable = tuple(input_broadcastable)
self.input_broadcastable = input_broadcastable
self.new_order = new_order
self.new_order = tuple(new_order)
for i, b in enumerate(input_broadcastable):
if i not in new_order:
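Converting new_order to a tuple is presumably about hashability: the order takes part in comparing and caching the op (dicts, sets, graph merging), and a list cannot be hashed while a tuple can:

    order = [0, 'x', 1]
    # hash(order)        # TypeError: unhashable type: 'list'
    hash(tuple(order))    # works, so the op's properties stay hashable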
@@ -351,8 +365,7 @@ class GpuDimShuffle(GpuOp):
# Both cases are good.
ob = []
if not isinstance(input.type, CudaNdarrayType):
raise TypeError("The input of a GpuDimshuffle must"
" be a CudaNdarray")
input = as_cuda_ndarray_variable(input)
for value in self.new_order:
if value == 'x':
ob.append(True)
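The loop above derives the output broadcastable pattern from new_order: an 'x' entry adds a new dimension that is always broadcastable, and an integer entry copies the flag of the corresponding input dimension. As a small standalone function (illustrative only):

    def dimshuffle_broadcastable(input_broadcastable, new_order):
        ob = []
        for value in new_order:
            if value == 'x':
                ob.append(True)                        # new axis: always broadcastable
            else:
                ob.append(input_broadcastable[value])  # keep the input's flag
        return tuple(ob)

    assert dimshuffle_broadcastable((False, True), ('x', 0, 1)) == (True, False, True)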
@@ -3246,9 +3259,7 @@ class GpuAlloc(GpuOp):
v = as_cuda_ndarray_variable(value)
sh = [tensor.as_tensor_variable(s) for s in shape]
if v.ndim != len(shape):
raise TypeError(
'GpuAlloc requires value of same dimensions as shape',
value, len(shape))
value = tensor.shape_padleft(value, len(shape) - v.ndim)
bcast = []
for s in sh:
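With the change above, GpuAlloc.make_node accepts a value that has fewer dimensions than the requested shape, padding it on the left with broadcastable dimensions the same way tensor.alloc does on the CPU. shape_padleft only adds leading broadcastable axes; for example:

    from theano import tensor

    v = tensor.vector('v', dtype='float32')   # ndim 1
    padded = tensor.shape_padleft(v, 1)       # ndim 2, leading axis broadcastable
    assert padded.broadcastable == (True, False)
    # gpu_alloc(padded, 4, 3) can then broadcast the row to a (4, 3) output.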
@@ -1814,7 +1814,7 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
'fast_run', 'inplace', 'gpu_inplace')
tensor.opt.register_specialize_device(tensor.opt.local_shape_to_shape_i)
register_opt()(tensor.opt.local_shape_to_shape_i)
gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle)
)
@@ -1847,8 +1847,8 @@ def local_gpualloc(node):
val = node.inputs[0]
shp = node.inputs[1:]
old_out = node.outputs[0]
val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
new_out = host_from_gpu(gpu_alloc(val2, *shp))
new_out = host_from_gpu(gpu_alloc(val, *shp))
# Sigh. it's an annoying thing about theano
# that you can't add information to the graph.
# If for some reason it has come to light that
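Because the padding now happens inside GpuAlloc.make_node, the local_gpualloc optimizer above can hand the value straight to gpu_alloc; the padded and unpadded forms should build equivalent graphs. A quick equivalence check one might run on a CUDA-enabled build (variable names are illustrative):

    import numpy
    import theano
    from theano import tensor
    from theano.sandbox.cuda import basic_ops

    v = tensor.vector('v', dtype='float32')
    a = basic_ops.gpu_alloc(v, 4, 3)
    b = basic_ops.gpu_alloc(tensor.shape_padleft(v, 1), 4, 3)
    f = theano.function([v], [basic_ops.host_from_gpu(a),
                              basic_ops.host_from_gpu(b)])
    x, y = f(numpy.arange(3, dtype='float32'))
    assert numpy.allclose(x, y)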
@@ -10,6 +10,7 @@ import theano
from theano.compile.pfunc import pfunc
from theano import config, tensor
import theano.tensor.tests.test_nlinalg
import theano.tensor.tests.test_opt as test_opt
from theano.tests import unittest_tools as utt
@@ -87,16 +88,34 @@ def test_gpualloc():
assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
class Test_local_elemwise_alloc(unittest.TestCase):
dtype = config.floatX
class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
dtype = 'float32'
def setUp(self):
self.vec = tensor.vector('vec', dtype=theano.config.floatX)
self.mat = tensor.matrix('mat', dtype=theano.config.floatX)
self.tens = tensor.tensor3('tens', dtype=theano.config.floatX)
self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2)
self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.vec.shape)
super(Test_local_elemwise_alloc, self).setUp()
self.fast_run_mode = mode_with_gpu
self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2, 2)
self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.mat.shape)
self.alloc_w_dep_tens = basic_ops.gpu_alloc(
self.vec,
self.tens.shape[0],
self.tens.shape[1]
)
self.tv_wo_dep = basic_ops.gpu_alloc(self.vec, 5, 5)
self.tm_wo_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)
self.s = tensor.iscalar('s')
self.tv_w_dep = basic_ops.gpu_alloc(self.vec, self.s, self.s)
self.tm_w_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)
self.row = tensor.row(dtype=self.dtype)
self.o = basic_ops.gpu_alloc(self.row, 5, 5)
def _verify_alloc_count(self, f, count):
assert(
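The _verify_alloc_count / _verify_assert_count helpers simply count how many nodes of a given op type remain in the compiled graph. The general recipe looks roughly like this (count_ops is an illustrative name):

    def count_ops(f, op_class):
        # Walk the optimized graph of a compiled theano function.
        return sum(1 for node in f.maker.fgraph.toposort()
                   if isinstance(node.op, op_class))

    # After compiling func with the GPU optimizations enabled, e.g.:
    #   assert count_ops(func, basic_ops.GpuAlloc) == 0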
@@ -112,150 +131,6 @@ class Test_local_elemwise_alloc(unittest.TestCase):
if elem.op is not None]) == count
)
def test_remove_alloc_wo_dimshuffle(self):
# No optimization on alloc
from theano.printing import debugprint as dp
func = theano.function(
[self.vec, self.mat],
self.alloc_wo_dep + self.mat,
mode='FAST_COMPILE'
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
# Optimization on alloc with assert
func = theano.function(
[self.vec, self.mat],
self.alloc_wo_dep + self.mat,
mode=mode_with_gpu
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)
# No optimization on alloc without assert
func = theano.function(
[self.vec, self.mat],
self.alloc_w_dep + self.mat,
mode='FAST_COMPILE'
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
# Optimization on alloc without assert
temp_val = theano.config.experimental.local_alloc_elemwise_assert
theano.config.experimental.local_alloc_elemwise_assert = False
func = theano.function(
[self.vec, self.mat],
self.alloc_w_dep + self.mat,
mode=mode_with_gpu
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 0)
theano.config.experimental.local_alloc_elemwise_assert = temp_val
def test_remove_alloc_w_dimshuffle(self):
# No optimization on dimshuffle with assert
func = theano.function(
[self.vec, self.mat],
self.alloc_wo_dep.dimshuffle(0, 'x') + self.mat,
mode='FAST_COMPILE'
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
# Optimization on dimshuffle with assert
func = theano.function(
[self.vec, self.mat],
self.alloc_wo_dep.dimshuffle(0, 'x') + self.mat,
mode=mode_with_gpu
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)
# No optimization on dimshuffle without assert
func = theano.function(
[self.vec, self.mat],
self.alloc_w_dep.dimshuffle(0, 'x') + self.mat,
mode='FAST_COMPILE'
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
# Optimization on dimshuffle without assert
temp_val = theano.config.experimental.local_alloc_elemwise_assert
theano.config.experimental.local_alloc_elemwise_assert = False
func = theano.function(
[self.vec, self.mat],
self.alloc_w_dep + self.mat,
mode=mode_with_gpu
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 0)
theano.config.experimental.local_alloc_elemwise_assert = temp_val
def test_multi_input_single_alloc(self):
# No optimization on dimshuffle with assert
tv = basic_ops.gpu_alloc(self.vec, 5)
tm = basic_ops.gpu_alloc(self.mat, 5, 5)
func = theano.function(
[self.vec, self.mat],
tv + tm,
mode='FAST_COMPILE'
)
self._verify_alloc_count(func, 2)
self._verify_assert_count(func, 0)
# Optimization on dimshuffle with assert
func = theano.function(
[self.vec, self.mat],
tv + tm,
mode=mode_with_gpu
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 1)
# No optimization on dimshuffle without assert
s = tensor.iscalar('s')
#tv = tensor.alloc(self.vec, s, s)
#tm = tensor.alloc(self.mat, 5, 5, 5)
tv = basic_ops.gpu_alloc(self.vec, s)
tm = basic_ops.gpu_alloc(self.mat, 5, 5)
func = theano.function(
[self.vec, self.mat, s],
tv + tm,
mode='FAST_COMPILE'
)
self._verify_alloc_count(func, 2)
self._verify_assert_count(func, 0)
# Optimization on dimshuffle without assert
temp_val = theano.config.experimental.local_alloc_elemwise_assert
theano.config.experimental.local_alloc_elemwise_assert = False
func = theano.function(
[self.vec, self.mat, s],
tv + tm,
mode=mode_with_gpu
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
theano.config.experimental.local_alloc_elemwise_assert = temp_val
def test_error(self):
t3fft = theano.tensor.tensor(dtype=self.dtype,
broadcastable=(False, False, True))
row = theano.tensor.row(dtype=self.dtype)
o = basic_ops.gpu_alloc(row, 5, 5).dimshuffle(0, 1, 'x') + t3fft
func = theano.function(
[t3fft, row],
o,
mode=mode_with_gpu
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)
d = numpy.random.rand(5, 5, 1).astype(self.dtype)
r = numpy.random.rand(1, 5).astype(self.dtype)
func(d, r)
def test_alloc_memset_0():
i = tensor.iscalar()
@@ -2767,12 +2767,27 @@ class Test_local_elemwise_alloc(unittest.TestCase):
dtype = config.floatX
def setUp(self):
self.vec = T.vector('vec', dtype=theano.config.floatX)
self.mat = T.matrix('mat', dtype=theano.config.floatX)
self.tens = T.tensor3('tens', dtype=theano.config.floatX)
self.fast_compile_mode = 'FAST_COMPILE'
self.fast_run_mode = 'FAST_RUN'
self.vec = T.vector('vec', dtype=self.dtype)
self.mat = T.matrix('mat', dtype=self.dtype)
self.tens = T.tensor3('tens', dtype=self.dtype)
self.alloc_wo_dep = T.alloc(self.vec, 2, 2)
self.alloc_w_dep = T.alloc(self.vec, *self.mat.shape)
self.alloc_w_dep_tens = T.alloc(
self.vec,
self.tens.shape[0],
self.tens.shape[1]
)
self.tv_wo_dep = T.alloc(self.vec, 5, 5)
self.tm_wo_dep = T.alloc(self.mat, 5, 5, 5)
self.s = T.iscalar('s')
self.tv_w_dep = T.alloc(self.vec, self.s, self.s)
self.tm_w_dep = T.alloc(self.mat, 5, 5, 5)
self.row = theano.tensor.row(dtype=self.dtype)
self.o = T.alloc(self.row, 5, 5)
def _verify_alloc_count(self, f, count):
assert(
@@ -2793,7 +2808,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function(
[self.vec, self.mat],
self.alloc_wo_dep + self.mat,
mode='FAST_COMPILE'
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
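Concretely, these counts say: under the fast-compile mode the Alloc stays in the graph (count 1, no Assert), while under the fast-run mode the local_elemwise_alloc optimization removes it and may insert an Assert to guard the shape assumption it made. A minimal CPU-only illustration of the same effect (exact counts depend on the installed Theano's optimizer set, so treat them as indicative):

    import theano
    from theano import tensor

    vec = tensor.vector('vec')
    mat = tensor.matrix('mat')
    out = tensor.alloc(vec, 2, 2) + mat

    slow = theano.function([vec, mat], out, mode='FAST_COMPILE')
    fast = theano.function([vec, mat], out, mode='FAST_RUN')

    def n_allocs(f):
        return sum(isinstance(node.op, tensor.basic.Alloc)
                   for node in f.maker.fgraph.toposort())

    print(n_allocs(slow), n_allocs(fast))   # typically 1 and 0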
@@ -2802,8 +2817,9 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function(
[self.vec, self.mat],
self.alloc_wo_dep + self.mat,
mode='FAST_RUN'
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)
@@ -2811,7 +2827,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function(
[self.vec, self.mat],
self.alloc_w_dep + self.mat,
mode='FAST_COMPILE'
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
@@ -2820,7 +2836,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
func = function(
[self.vec, self.mat],
self.alloc_w_dep + self.mat,
mode='FAST_RUN'
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 0)
@@ -2829,8 +2845,9 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# No optimization on dimshuffle with assert
func = function(
[self.vec, self.tens],
T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_COMPILE'
self.alloc_wo_dep.dimshuffle(0, 1, 'x') + self.tens,
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
@@ -2838,8 +2855,9 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# Optimization on dimshuffle with assert
func = function(
[self.vec, self.tens],
T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_RUN'
self.alloc_wo_dep.dimshuffle(0, 1, 'x') + self.tens,
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)
@@ -2847,12 +2865,8 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# No optimization on dimshuffle without assert
func = function(
[self.vec, self.tens],
T.alloc(
self.vec,
self.tens.shape[0],
self.tens.shape[1]
).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_COMPILE'
self.alloc_w_dep_tens.dimshuffle(0, 1, 'x') + self.tens,
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
@@ -2860,52 +2874,51 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# Optimization on dimshuffle without assert
func = function(
[self.vec, self.tens],
T.alloc(
self.vec,
self.tens.shape[0],
self.tens.shape[1]
).dimshuffle(0, 1, 'x') + self.tens,
mode='FAST_RUN'
self.alloc_w_dep_tens.dimshuffle(0, 1, 'x') + self.tens,
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 0)
def test_multi_input_single_alloc(self):
tv = T.alloc(self.vec, 5, 5)
tm = T.alloc(self.mat, 5, 5, 5)
# No optimization on dimshuffle with assert
func = function(
[self.vec, self.mat],
tv + tm,
mode='FAST_COMPILE'
self.tv_wo_dep + self.tm_wo_dep,
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 2)
self._verify_assert_count(func, 0)
# Optimization on dimshuffle with assert
temp = self.tv_wo_dep + self.tm_wo_dep
func = function(
[self.vec, self.mat],
tv + tm,
mode='FAST_RUN'
temp,
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 0)
s = T.iscalar('s')
tv = T.alloc(self.vec, s, s)
tm = T.alloc(self.mat, 5, 5, 5)
# No optimization on dimshuffle without assert
func = function(
[self.vec, self.mat, s],
tv + tm,
mode='FAST_COMPILE'
[self.vec, self.mat, self.s],
self.tv_w_dep + self.tm_w_dep,
mode=self.fast_compile_mode
)
self._verify_alloc_count(func, 2)
self._verify_assert_count(func, 0)
# Optimization on dimshuffle without assert
func = function(
[self.vec, self.mat, s],
tv + tm,
mode='FAST_RUN'
[self.vec, self.mat, self.s],
self.tv_w_dep + self.tm_w_dep,
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 1)
self._verify_assert_count(func, 1)
@@ -2913,12 +2926,13 @@ class Test_local_elemwise_alloc(unittest.TestCase):
def test_error(self):
t3fft = theano.tensor.tensor(dtype=self.dtype,
broadcastable=(False, False, True))
row = theano.tensor.row(dtype=self.dtype)
o = T.alloc(row, 5, 5).dimshuffle(0, 1, 'x') + t3fft
o = self.o.dimshuffle(0, 1, 'x') + t3fft
func = function(
[t3fft, row],
[t3fft, self.row],
o,
mode='FAST_RUN'
mode=self.fast_run_mode
)
self._verify_alloc_count(func, 0)
self._verify_assert_count(func, 1)