提交 a4d2db75 authored 作者: Dustin Webb's avatar Dustin Webb

Started applying local_elemwise_alloc to GpuElemwise.

上级 c4a2fd88
...@@ -313,6 +313,13 @@ class GpuDimShuffle(GpuOp): ...@@ -313,6 +313,13 @@ class GpuDimShuffle(GpuOp):
" dimension.", " dimension.",
(input_broadcastable, new_order)) (input_broadcastable, new_order))
# this is the list of the original dimensions that we keep
self.shuffle = [x for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not
# in the original input
self.augment = [i for i, x in enumerate(new_order) if x == 'x']
self.view_map = {0: [0]} self.view_map = {0: [0]}
self._rehash() self._rehash()
...@@ -486,6 +493,17 @@ class GpuDimShuffle(GpuOp): ...@@ -486,6 +493,17 @@ class GpuDimShuffle(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, 0) return (1, 0)
def infer_shape(self, node, shapes):
    """Infer the output shape of the dimshuffle from the input shape.

    The kept input dimensions are permuted according to ``self.shuffle``,
    then a broadcastable dimension of size 1 is inserted at every index
    listed in ``self.augment``.
    """
    (in_shape,) = shapes
    # Permute the surviving input dimensions into their new order.
    out_shape = [in_shape[dim] for dim in self.shuffle]
    # Splice in the new broadcastable ('x') dimensions, each of size 1.
    for position in self.augment:
        out_shape.insert(position, 1)
    return [out_shape]
class GpuCAReduce(GpuOp): class GpuCAReduce(GpuOp):
"""GpuCAReduce is a Reduction along some dimensions by a scalar op. """GpuCAReduce is a Reduction along some dimensions by a scalar op.
......
...@@ -1814,6 +1814,15 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op( ...@@ -1814,6 +1814,15 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75, optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
'fast_run', 'inplace', 'gpu_inplace') 'fast_run', 'inplace', 'gpu_inplace')
# Build the GPU variant of the elemwise-alloc optimization:
# local_elemwise_alloc_op is parameterized with the GPU ops (GpuElemwise,
# GpuAlloc, GpuDimShuffle), wrapped as a local optimizer that matches
# GpuElemwise nodes, and registered via register_specialize_device.
# NOTE(review): presumably this registers the optimizer in the
# device-specialization phase of the optimization database — confirm
# against tensor.opt.register_specialize_device.
gpu_local_elemwise_alloc = tensor.opt.register_specialize_device(
    gof.local_optimizer([GpuElemwise])(
        tensor.opt.local_elemwise_alloc_op(
            GpuElemwise,
            GpuAlloc,
            GpuDimShuffle
        )
    )
)
@register_opt() @register_opt()
@local_optimizer([tensor.alloc]) @local_optimizer([tensor.alloc])
......
import operator import operator
import sys import sys
import unittest
import numpy import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
...@@ -86,6 +87,74 @@ def test_gpualloc(): ...@@ -86,6 +87,74 @@ def test_gpualloc():
assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l]) assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
class Test_local_elemwise_alloc(unittest.TestCase):
    """Tests for the elemwise-alloc optimization on GPU graphs.

    Verifies that GpuAlloc nodes feeding an elemwise are removed (or kept)
    by the optimizer, depending on the compilation mode and on whether the
    Alloc's shape is derived from the allocated input itself.
    """

    dtype = config.floatX

    def setUp(self):
        self.vec = tensor.vector('vec', dtype=theano.config.floatX)
        self.mat = tensor.matrix('mat', dtype=theano.config.floatX)
        self.tens = tensor.tensor3('tens', dtype=theano.config.floatX)
        # Alloc whose shape is a constant, independent of the input.
        self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2)
        # Alloc whose shape depends on the input's own shape.
        self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.vec.shape)

    def _verify_alloc_count(self, f, count):
        # Exactly `count` GpuAlloc nodes must remain in the compiled graph.
        assert(
            sum([isinstance(elem.op, basic_ops.GpuAlloc)
                 for elem in f.maker.fgraph.toposort()
                 if elem.op is not None]) == count
        )

    def _verify_assert_count(self, f, count):
        # Exactly `count` Assert nodes must remain in the compiled graph.
        assert(
            sum([isinstance(elem.op, tensor.opt.Assert)
                 for elem in f.maker.fgraph.toposort()
                 if elem.op is not None]) == count
        )

    def test_remove_alloc_wo_dimshuffle(self):
        # FAST_COMPILE does not run the optimization: the Alloc survives.
        func = theano.function(
            [self.vec, self.mat],
            self.alloc_wo_dep + self.mat,
            mode='FAST_COMPILE'
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)

        # TODO(review): compiling `self.alloc_wo_dep + self.mat` with
        # mode_with_gpu should remove the Alloc and insert one Assert
        # (alloc_count == 0, assert_count == 1); enable that check once
        # the GPU optimization is complete.

        # FAST_COMPILE again: shape-dependent Alloc also survives.
        func = theano.function(
            [self.vec, self.mat],
            self.alloc_w_dep + self.mat,
            mode='FAST_COMPILE'
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)

        # With the GPU optimizations enabled the shape-dependent Alloc is
        # removed and no Assert is needed.
        func = theano.function(
            [self.vec, self.mat],
            self.alloc_w_dep + self.mat,
            mode=mode_with_gpu
        )
        self._verify_alloc_count(func, 0)
        self._verify_assert_count(func, 0)
def test_alloc_memset_0(): def test_alloc_memset_0():
i = tensor.iscalar() i = tensor.iscalar()
z = numpy.zeros((1,), dtype='float32') z = numpy.zeros((1,), dtype='float32')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论