提交 3bd237f5 authored 作者: sentient07's avatar sentient07

Added fft opts to new opt, fixed broadcasting test, few cleanups

上级 ebcf5615
......@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType
from theano.gpuarray import (basic_ops, GpuArrayType)
import theano.tensor.fft
from .opt import register_opt, op_lifter
from .opt import register_opt, op_lifter, register_opt2
try:
import pygpu
......@@ -373,10 +373,12 @@ def _unitary(norm):
if scikits_cuda_available:
@register_opt('fast_compile')
@op_lifter([theano.tensor.fft.RFFTOp])
def local_curfft_op(node, context_name):
@register_opt2([theano.tensor.fft.RFFTOp], 'fast_compile')
def local_gpua_curfft_op(node, context_name):
return curfft_op
@register_opt('fast_compile')
@op_lifter([theano.tensor.fft.IRFFTOp])
def local_cuirfft_op(node, context_name):
@register_opt2([theano.tensor.fft.IRFFTOp], 'fast_compile')
def local_gpua_cuirfft_op(node, context_name):
return cuirfft_op
......@@ -112,11 +112,11 @@ def register_opt2(tracks, *tags, **kwargs):
Parameters
----------
tracks : Op
tracks : List of Op classes, or Op instance, or None
The Node's Op to which optimization is being applied.
tags : String
The tag optimization mode to which the optimizer will be registered.
The optimization tag to which the optimizer will be registered.
'''
def f(local_opt):
......@@ -180,10 +180,10 @@ def op_lifter(OP, cuda_only=False):
context_name = i.owner.inputs[0].type.context_name
replace = True
break
clients = [c for o in node.outputs for c in o.clients]
if not replace:
# We replace if *all* clients are on the GPU
clients = [c for o in node.outputs for c in o.clients]
replace = len(clients) != 0
for c, idx in clients:
if (c == 'output' or
......@@ -273,7 +273,7 @@ class GraphToGPU(NavigatorOptimizer):
Parameters
----------
local_optimizers_all : List or Set
local_optimizers_all : List or SortedSet
The local optimizations to apply to a node.
local_optimizers_map : Dict
Dictionary object containing the mapping of Op to list of
......@@ -349,7 +349,6 @@ class GraphToGPU(NavigatorOptimizer):
self.local_optimizers_map.get(type(c.op), []))):
move_to_GPU = True
new_ops = None
outputs = []
# Apply the lifter
if move_to_GPU:
for lopt in (self.local_optimizers_map.get(node.op, []) +
......@@ -365,24 +364,23 @@ class GraphToGPU(NavigatorOptimizer):
if new_ops:
process_count[lopt] += 1
break
if not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i)
for i in node.inputs])
outputs = []
if isinstance(new_ops, theano.Op):
outputs = new_ops(*[mapping[i] for i in node.inputs], return_list=True)
elif not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs])
outputs = newnode.outputs
elif isinstance(new_ops, (tuple, list)):
outputs = []
for o in new_ops:
outputs.append(o)
outputs = new_ops
elif isinstance(new_ops, theano.Variable):
outputs = [new_ops]
else:
outputs = new_ops(*[mapping[i] for i in node.inputs],
return_list=True)
if new_ops:
node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
for new_o, old_o in zip(outputs, node.outputs):
assert len(outputs) == len(node.outputs)
mapping[old_o] = new_o
new_nodes = []
......@@ -473,15 +471,6 @@ class GraphToGPU(NavigatorOptimizer):
prof2[0].local_optimizers_map)
new_opt = GraphToGPU(local_optimizers, local_optimizers_map)
def merge_list(l1, l2):
l = copy.copy(l1)
for idx, nb in enumerate(l2):
if idx < len(l):
l[idx] += nb
else:
l.append(nb)
return l
toposort_timing = prof1[1] + prof2[1]
time_opts = merge_dict(prof1[2], prof2[2])
node_created = merge_dict(prof1[3], prof2[3])
......@@ -583,7 +572,7 @@ def local_gpua_alloc(op, context_name, inputs, outputs):
@register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty])
@register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpua_allocempty(op, context_name, inputs, outputs):
def local_gpua_alloc_empty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the
# CPU op props.
return gpu_alloc_empty(context_name, **op._props_dict())
......@@ -949,7 +938,7 @@ def local_gpua_subtensor_graph(op, context_name, inputs, outputs):
@register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor])
@register_opt2([tensor.IncSubtensor], 'fast_compile')
def local_gpua_incsubtensor(op, context_name, inputs, outputs):
def local_gpua_inc_subtensor(op, context_name, inputs, outputs):
op = GpuIncSubtensor(op.idx_list, op.inplace,
op.set_instead_of_inc,
op.destroyhandler_tolerate_aliased)
......@@ -1229,11 +1218,11 @@ def local_gpua_softmaxwithbias(op, context_name, inputs, outputs):
def local_gpua_assert(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType):
return
return local_assert_graph(op, context_name, inputs, outputs)
return local_gpua_assert_graph(op, context_name, inputs, outputs)
@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
def local_assert_graph(op, context_name, inputs, outputs):
def local_gpua_assert_graph(op, context_name, inputs, outputs):
return [op(as_gpuarray_variable(inputs[0], context_name),
*inputs[1:])]
......@@ -1253,7 +1242,7 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile')
@op_lifter([SparseBlockGemv])
@register_opt2([SparseBlockGemv], 'fast_compile')
def local_gpua_lift_sparseblockgemv(op, context_name, inputs, outputs):
def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
if op.inplace:
return gpu_sparse_block_gemv_inplace
else:
......@@ -1263,7 +1252,7 @@ def local_gpua_lift_sparseblockgemv(op, context_name, inputs, outputs):
@register_opt('fast_compile')
@op_lifter([SparseBlockOuter])
@register_opt2([SparseBlockOuter], 'fast_compile')
def local_gpua_lift_sparseblockouter(op, context_name, inputs, outputs):
def local_gpua_sparseblockouter(op, context_name, inputs, outputs):
if op.inplace:
return gpu_sparse_block_outer_inplace
else:
......@@ -1289,7 +1278,7 @@ def local_inplace_sparseblockouter(node):
@op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_gpua_lift_abstractconv2d(op, context_name, inputs, outputs):
def local_gpua_abstractconv2d(op, context_name, inputs, outputs):
if isinstance(outputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU.
return
......
......@@ -187,7 +187,7 @@ def test_local_gpualloc_empty():
ii = theano.tensor.iscalar()
# Test with vector
# Should not be moved as the only client is the uutput
# Should not be moved as the only client is the output
a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
......
......@@ -1553,7 +1553,7 @@ class MRG_RandomStreams(object):
@register_opt2([mrg_uniform], 'fast_compile')
def local_gpua_mrg1(op, context_name, inputs, outputs):
def local_gpua_mrg_graph(op, context_name, inputs, outputs):
if (type(op) == mrg_uniform and
isinstance(inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(inputs[0],
......@@ -1566,9 +1566,8 @@ def local_gpua_mrg1(op, context_name, inputs, outputs):
@register_gpua('fast_compile')
@local_optimizer([mrg_uniform])
def local_gpua_mrg(node):
# TODO : need description for function
context_name = infer_context_name(*node.inputs)
return local_gpua_mrg1(node.op, context_name, node.inputs, node.outputs)
return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
......@@ -7003,7 +7003,7 @@ class T_get_scalar_constant_value(unittest.TestCase):
assert get_scalar_constant_value(s) == 3
s = opt.Shape_i(1)(c)
assert get_scalar_constant_value(s) == 4
d = theano.tensor.constant(numpy.random.rand(1, 1))
d = theano.shared(numpy.random.randn(1,1), broadcastable=(True, True))
f = theano.tensor.basic.ScalarFromTensor()(opt.Shape_i(0)(d))
assert get_scalar_constant_value(f) == 1
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论