提交 6d23147f authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #4496 from nouiz/gpu_opt

Gpu opt
...@@ -225,12 +225,16 @@ class SeqOptimizer(Optimizer, list): ...@@ -225,12 +225,16 @@ class SeqOptimizer(Optimizer, list):
callback_before = fgraph.execute_callbacks_time callback_before = fgraph.execute_callbacks_time
nb_node_before = len(fgraph.apply_nodes) nb_node_before = len(fgraph.apply_nodes)
sub_profs = [] sub_profs = []
nb_nodes = []
for optimizer in self: for optimizer in self:
try: try:
nb_nodes_before = len(fgraph.apply_nodes)
t0 = time.time() t0 = time.time()
sub_prof = optimizer.optimize(fgraph) sub_prof = optimizer.optimize(fgraph)
l.append(float(time.time() - t0)) l.append(float(time.time() - t0))
sub_profs.append(sub_prof) sub_profs.append(sub_prof)
nb_nodes.append((nb_nodes_before,
len(fgraph.apply_nodes)))
if fgraph.profile: if fgraph.profile:
sub_validate_time.append(fgraph.profile.validate_time) sub_validate_time.append(fgraph.profile.validate_time)
except AssertionError: except AssertionError:
...@@ -249,7 +253,8 @@ class SeqOptimizer(Optimizer, list): ...@@ -249,7 +253,8 @@ class SeqOptimizer(Optimizer, list):
validate_time = None validate_time = None
callback_time = fgraph.execute_callbacks_time - callback_before callback_time = fgraph.execute_callbacks_time - callback_before
return (self, l, validate_time, callback_time, nb_node_before, return (self, l, validate_time, callback_time, nb_node_before,
len(fgraph.apply_nodes), sub_profs, sub_validate_time) len(fgraph.apply_nodes), sub_profs, sub_validate_time,
nb_nodes)
def __str__(self): def __str__(self):
return "SeqOpt(%s)" % list.__str__(self) return "SeqOpt(%s)" % list.__str__(self)
...@@ -270,7 +275,7 @@ class SeqOptimizer(Optimizer, list): ...@@ -270,7 +275,7 @@ class SeqOptimizer(Optimizer, list):
@staticmethod @staticmethod
def print_profile(stream, prof, level=0): def print_profile(stream, prof, level=0):
(opts, prof, validate_time, callback_time, nb_node_before, (opts, prof, validate_time, callback_time, nb_node_before,
nb_node_after, sub_profs, sub_validate_time) = prof nb_node_after, sub_profs, sub_validate_time, nb_nodes) = prof
blanc = (' ' * level) blanc = (' ' * level)
print(blanc, "SeqOptimizer", end=' ', file=stream) print(blanc, "SeqOptimizer", end=' ', file=stream)
...@@ -284,18 +289,19 @@ class SeqOptimizer(Optimizer, list): ...@@ -284,18 +289,19 @@ class SeqOptimizer(Optimizer, list):
print(blanc, " %.3fs for callback" % (callback_time), file=stream) print(blanc, " %.3fs for callback" % (callback_time), file=stream)
print(blanc, " %.3fs for fgraph.validate()" % (validate_time), file=stream) print(blanc, " %.3fs for fgraph.validate()" % (validate_time), file=stream)
if level == 0: if level == 0:
print(blanc, " time - (name, class, index) - validate time", file=stream) print(blanc, " time - (name, class, index, nodes before, nodes after) - validate time", file=stream)
ll = [] ll = []
for opt in opts: for opt in opts:
if hasattr(opt, "__name__"): if hasattr(opt, "__name__"):
ll.append((opt.__name__, opt.__class__.__name__, name = opt.__name__
opts.index(opt)))
else: else:
ll.append((opt.name, opt.__class__.__name__, name = opt.name
opts.index(opt))) idx = opts.index(opt)
lll = sorted(zip(prof, ll), key=lambda a: a[0]) ll.append((name, opt.__class__.__name__,
idx) + nb_nodes[idx])
lll = sorted(zip(prof, ll, nb_nodes), key=lambda a: a[0])
for (t, opt) in lll[::-1]: for (t, opt, nb_n) in lll[::-1]:
# if t < 1: # if t < 1:
# continue # continue
if sub_validate_time: if sub_validate_time:
......
...@@ -245,7 +245,8 @@ def local_cut_gpu_transfers(node): ...@@ -245,7 +245,8 @@ def local_cut_gpu_transfers(node):
# host -> # host ->
if isinstance(n2.op, GpuFromHost): if isinstance(n2.op, GpuFromHost):
return [GpuFromHost(node.op.context_name)(n2.inputs[0])] return [as_gpuarray_variable(n2.inputs[0],
node.op.context_name)]
# gpuc -> # gpuc ->
if isinstance(n2.op, GpuToGpu): if isinstance(n2.op, GpuToGpu):
...@@ -464,7 +465,8 @@ def local_gpua_dimshuffle(node, context_name): ...@@ -464,7 +465,8 @@ def local_gpua_dimshuffle(node, context_name):
def local_gpua_specifyShape(node, context_name): def local_gpua_specifyShape(node, context_name):
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
inp = [GpuFromHost(context_name)(node.inputs[0])] + node.inputs[1:] inp = [as_gpuarray_variable(node.inputs[0], context_name)]
inp += node.inputs[1:]
return tensor.specify_shape(*inp) return tensor.specify_shape(*inp)
...@@ -475,7 +477,7 @@ def local_gpua_shape(node, context_name): ...@@ -475,7 +477,7 @@ def local_gpua_shape(node, context_name):
# always on the CPU. # always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
return [GpuFromHost(context_name)(node.inputs[0]).shape] return [as_gpuarray_variable(node.inputs[0], context_name).shape]
def gpu_print_wrapper(op, cnda): def gpu_print_wrapper(op, cnda):
...@@ -530,7 +532,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -530,7 +532,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu: elif output_goes_to_gpu:
# The input should be transferred to the gpu # The input should be transferred to the gpu
new_inputs.append(GpuFromHost(context_name)(inp)) new_inputs.append(as_gpuarray_variable(inp, context_name))
input_transfered.append(True) input_transfered.append(True)
else: else:
...@@ -690,7 +692,8 @@ def local_gpua_careduce(node, context_name): ...@@ -690,7 +692,8 @@ def local_gpua_careduce(node, context_name):
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op is GpuCAReduceCPY or if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([GpuFromHost(context_name)(x)])): gvar.owner.op.supports_c_code([
as_gpuarray_variable(x, context_name)])):
return greduce return greduce
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
...@@ -730,7 +733,7 @@ def local_gpua_careduce(node, context_name): ...@@ -730,7 +733,7 @@ def local_gpua_careduce(node, context_name):
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = GpuFromHost(context_name)(reshaped_x) gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
gvar = greduce(gpu_reshaped_x) gvar = greduce(gpu_reshaped_x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
......
...@@ -299,7 +299,7 @@ def local_gpu_elemwise_0(node): ...@@ -299,7 +299,7 @@ def local_gpu_elemwise_0(node):
if all([i.type.dtype == 'float32' for i in node.inputs]): if all([i.type.dtype == 'float32' for i in node.inputs]):
# TODO: change this when fusion makes Elemwise with # TODO: change this when fusion makes Elemwise with
# multiple outputs # multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i) gpu_elemwise = new_op(*(as_cuda_ndarray_variable(i)
for i in node.inputs), for i in node.inputs),
return_list=True) return_list=True)
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
...@@ -312,7 +312,7 @@ def local_gpu_elemwise_0(node): ...@@ -312,7 +312,7 @@ def local_gpu_elemwise_0(node):
if [o.type for o in upcasted.outputs] ==\ if [o.type for o in upcasted.outputs] ==\
[o.type for o in node.outputs]: [o.type for o in node.outputs]:
new_inputs = [gpu_from_host(tensor.cast(i, 'float32')) new_inputs = [as_cuda_ndarray_variable(tensor.cast(i, 'float32'))
for i in node.inputs] for i in node.inputs]
gpu_elemwise = new_op(*new_inputs, return_list=True) gpu_elemwise = new_op(*new_inputs, return_list=True)
else: else:
...@@ -1314,7 +1314,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -1314,7 +1314,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu: elif output_goes_to_gpu:
# The input should be transferred to the gpu # The input should be transferred to the gpu
new_inputs.append(gpu_from_host(inp)) new_inputs.append(as_cuda_ndarray_variable(inp))
input_transfered.append(True) input_transfered.append(True)
else: else:
...@@ -1537,7 +1537,7 @@ def local_gpu_conv(node): ...@@ -1537,7 +1537,7 @@ def local_gpu_conv(node):
img.shape[0], *op.imshp_logical) img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride], img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img) img)
img = gpu_from_host(img) img = as_cuda_ndarray_variable(img)
return ret(img, kern) return ret(img, kern)
return make_graph return make_graph
...@@ -1551,8 +1551,8 @@ def local_gpu_conv(node): ...@@ -1551,8 +1551,8 @@ def local_gpu_conv(node):
if gpu_conv is None: if gpu_conv is None:
return return
img, kern = host_input.owner.inputs img, kern = host_input.owner.inputs
out = gpu_conv(gpu_from_host(img), out = gpu_conv(as_cuda_ndarray_variable(img),
gpu_from_host(kern)) as_cuda_ndarray_variable(kern))
out = tensor.patternbroadcast(out, out = tensor.patternbroadcast(out,
node.outputs[0].broadcastable) node.outputs[0].broadcastable)
out.tag.values_eq_approx = values_eq_approx_high_tol out.tag.values_eq_approx = values_eq_approx_high_tol
...@@ -1569,8 +1569,8 @@ def local_gpu_conv(node): ...@@ -1569,8 +1569,8 @@ def local_gpu_conv(node):
gpu_conv = GpuConvOp_from_ConvOp(node.op) gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None: if gpu_conv is None:
return return
out = gpu_conv(gpu_from_host(img), out = gpu_conv(as_cuda_ndarray_variable(img),
gpu_from_host(kern)) as_cuda_ndarray_variable(kern))
out = tensor.patternbroadcast( out = tensor.patternbroadcast(
host_from_gpu(out), host_from_gpu(out),
node.outputs[0].broadcastable) node.outputs[0].broadcastable)
......
...@@ -155,13 +155,16 @@ def broadcast_like(value, template, fgraph, dtype=None): ...@@ -155,13 +155,16 @@ def broadcast_like(value, template, fgraph, dtype=None):
if template not in fgraph.variables: if template not in fgraph.variables:
raise NotImplementedError('broadcast_like currently requires the ' raise NotImplementedError('broadcast_like currently requires the '
'template Variable to be in the fgraph already') 'template Variable to be in the fgraph already')
if dtype is None:
dtype = template.dtype
value = T.cast(value, dtype)
if value.type == template.type:
return value
if hasattr(fgraph, 'shape_feature'): if hasattr(fgraph, 'shape_feature'):
new_shape = fgraph.shape_feature.shape_of[template] new_shape = fgraph.shape_feature.shape_of[template]
else: else:
new_shape = template.shape new_shape = template.shape
if dtype is None: rval = T.alloc(value, *new_shape)
dtype = template.dtype
rval = T.alloc(T.cast(value, dtype), *new_shape)
# the template may have 1s in its shape without being broadcastable # the template may have 1s in its shape without being broadcastable
if rval.broadcastable != template.broadcastable: if rval.broadcastable != template.broadcastable:
rval = T.unbroadcast(rval, *[i for i in xrange(rval.ndim) rval = T.unbroadcast(rval, *[i for i in xrange(rval.ndim)
...@@ -234,6 +237,11 @@ def inplace_elemwise_optimizer_op(OP): ...@@ -234,6 +237,11 @@ def inplace_elemwise_optimizer_op(OP):
else: else:
update_outs = [] update_outs = []
protected_inputs = [
f.protected for f in fgraph._features if
isinstance(f, theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs, []) # flatten the list
protected_inputs.extend(fgraph.outputs)
for node in list(graph.io_toposort(fgraph.inputs, fgraph.outputs)): for node in list(graph.io_toposort(fgraph.inputs, fgraph.outputs)):
op = node.op op = node.op
# gpuarray GpuElemwise inherit from Elemwise # gpuarray GpuElemwise inherit from Elemwise
...@@ -242,25 +250,39 @@ def inplace_elemwise_optimizer_op(OP): ...@@ -242,25 +250,39 @@ def inplace_elemwise_optimizer_op(OP):
# If big graph and the outputs are scalar, do not make it # If big graph and the outputs are scalar, do not make it
# inplace. # inplace.
if (check_each_change != 1 and if (check_each_change != 1 and
all([getattr(o.type, 'ndim', -1) == 0 # If multiple outputs, they must all have the same size,
for o in node.outputs])): # so only check the first.
getattr(node.outputs[0].type, 'ndim', -1) == 0):
continue continue
if op.inplace_pattern:
# Maybe this isn't needed anymore, but I don't want to
# risk a regression now. This case only happens if the
# original node already has some inplace pattern and we
# still try to add more pattern.
baseline = op.inplace_pattern baseline = op.inplace_pattern
protected_inputs = [
f.protected for f in node.fgraph._features if
isinstance(f, theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs, []) # flatten the list
protected_inputs.extend(fgraph.outputs)
candidate_outputs = [i for i in xrange(len(node.outputs)) candidate_outputs = [i for i in xrange(len(node.outputs))
if i not in baseline] if i not in baseline]
# node inputs that are Constant, already destroyed, # node inputs that are Constant, already destroyed,
# fgraph protected inputs and fgraph outputs can't be used as inplace # or fgraph protected inputs and fgraph outputs can't be used as
# target. # inplace target.
# Remove here as faster. # Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs)) candidate_inputs = [i for i in xrange(len(node.inputs))
if i not in baseline.values() and if i not in baseline.values() and
not isinstance(node.inputs[i], Constant) and not isinstance(node.inputs[i], Constant) and
# Is next line costly?
not fgraph.destroyers(node.inputs[i]) and
node.inputs[i] not in protected_inputs]
else:
baseline = []
candidate_outputs = list(range(len(node.outputs)))
# node inputs that are Constant, already destroyed,
# fgraph protected inputs and fgraph outputs can't be used as inplace
# target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs))
if not isinstance(node.inputs[i], Constant) and
not fgraph.destroyers(node.inputs[i]) and not fgraph.destroyers(node.inputs[i]) and
node.inputs[i] not in protected_inputs] node.inputs[i] not in protected_inputs]
...@@ -2706,6 +2728,7 @@ def merge_two_slices(slice1, len1, slice2, len2): ...@@ -2706,6 +2728,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
val = T.switch(T.lt(sl2, 0), - len1 - 1, val) val = T.switch(T.lt(sl2, 0), - len1 - 1, val)
if sl1.step: if sl1.step:
val = T.switch(T.eq(sl1.step, 0), len1 + 1, val) val = T.switch(T.eq(sl1.step, 0), len1 + 1, val)
val = pre_greedy_local_optimizer(list_opt, val)
return val return val
else: else:
# We are in the more complex case when we do not actually know # We are in the more complex case when we do not actually know
...@@ -2730,6 +2753,7 @@ def merge_two_slices(slice1, len1, slice2, len2): ...@@ -2730,6 +2753,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
val = T.switch(T.lt(sl2, 0), - len1 - 1, val) val = T.switch(T.lt(sl2, 0), - len1 - 1, val)
if sl1.step: if sl1.step:
val = T.switch(T.eq(sl1.step, 0), len1 + 1, val) val = T.switch(T.eq(sl1.step, 0), len1 + 1, val)
val = pre_greedy_local_optimizer(list_opt, val)
return val return val
else: else:
# We are dealing with two slices that need to be put together # We are dealing with two slices that need to be put together
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论