提交 51964e4e authored 作者: abergeron's avatar abergeron

Merge pull request #1955 from nouiz/debugmode

Speed up Debugmode
......@@ -38,6 +38,7 @@ script:
- ulimit -a
- echo $PART
- theano-nose --with-timelimit -v $PART
- theano-cache list
#after_script:
......
......@@ -138,6 +138,13 @@ default values.
:return: the number of bytes taken by the object described by
``shape_info``.
.. method:: may_share_memory(a, b)
Optional. Only needed for DebugMode. Return True if the python
objects `a` and `b` could share memory. Return False
otherwise. It is used to debug when Ops didn't declare memory
aliasing between variables. Must be a static method.
For each method, the *default* is what ``Type`` defines
for you. So, if you create an instance of ``Type`` or an
instance of a subclass of ``Type``, you
......
......@@ -685,9 +685,10 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
actually_inplace_outputs = []
dmap = getattr(node.op, 'destroy_map', {})
for oo, ii in dmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
var = node.outputs[oo]
out_var = storage_map[var][0]
in_var = storage_map[node.inputs[ii[0]]][0]
if _may_share_memory(out_var, in_var):
if var.type.may_share_memory(out_var, in_var):
actually_inplace_outputs.append(node.outputs[oo])
if warn_input_not_reused and destroyed_res_list:
......@@ -702,9 +703,11 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
vmap = getattr(node.op, 'view_map', {})
for oo, ii in vmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
var = node.outputs[oo]
out_var = storage_map[var][0]
in_var = storage_map[node.inputs[ii[0]]][0]
if _may_share_memory(out_var, in_var):
may_share = var.type.may_share_memory(out_var, in_var)
if may_share:
actually_inplace_outputs.append(node.outputs[oo])
if warn_input_not_reused:
......@@ -717,7 +720,7 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
if isinstance(node.op, OutputGuard):
# This class is not in the final graph.
continue
if not _may_share_memory(out_var, in_var):
if not may_share:
_logger.warning("Optimization Warning: input idx %d marked "
"as viewed but new memory allocated by node '%s'",
ii[0], str(node))
......@@ -766,7 +769,7 @@ def _check_viewmap(node, storage_map):
for ii, inode in enumerate(node.inputs):
if _may_share_memory(outstorage, storage_map[inode][0]):
if inode.type.may_share_memory(outstorage, storage_map[inode][0]):
nodeid = id(inode)
bad_alias[nodeid] = ii
......@@ -794,26 +797,18 @@ def _check_viewmap(node, storage_map):
other_storage = storage_map[other_onode][0]
# check to see if we share memory with this other output
# this is not a problem if the node is not actually used
if _is_used_in_graph(other_onode) and \
_may_share_memory(outstorage, other_storage):
if (_is_used_in_graph(other_onode) and
other_onode.type.may_share_memory(outstorage,
other_storage)):
raise BadViewMap(node, oi, outstorage,
out_alias_idx=other_oi)
def _may_share_memory(a, b):
from theano.misc.may_share_memory import may_share_memory
return may_share_memory(a, b, False)
def _is_function_output(node):
def _is_used_in_graph(var):
"""
Returns True if the node in question is the a final output of the graph
Returns True if `var` is used by another node in the graph
"""
return node.clients == [('output', 1)]
def _is_used_in_graph(node):
return not(_is_function_output(node) or node.clients == [])
return not(var.clients == [('output', 1)] or var.clients == [])
def _check_strides_match(a, b, warn_err, op):
......@@ -1111,18 +1106,21 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# is less relevant.
# Dimensions should be align by the innermost index, so we iterate
# from the end of shapes.
max_ndim = 0
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
assert len(rev_out_broadcastable) == max_ndim
for i, b in enumerate(r.broadcastable[::-1]):
rev_out_broadcastable[i] = rev_out_broadcastable[i] and b
out_broadcastable = rev_out_broadcastable[::-1]
if ('strided' in prealloc_modes or
'wrong_size' in prealloc_modes or
'ALL' in prealloc_modes):
max_ndim = 0
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
assert len(rev_out_broadcastable) == max_ndim
for i, b in enumerate(r.broadcastable[::-1]):
rev_out_broadcastable[i] = rev_out_broadcastable[i] and b
out_broadcastable = rev_out_broadcastable[::-1]
if 'strided' in prealloc_modes or 'ALL' in prealloc_modes:
check_ndim = config.DebugMode.check_preallocated_output_ndim
......
......@@ -677,6 +677,11 @@ class CLinker(link.Linker):
raise NotImplementedError("%s cannot produce C code" % op)
assert isinstance(behavior, basestring), (
str(node.op) + " didn't return a string for c_code")
# Prepend a comment naming the Op class, to make the generated
# C code easier to read. Note: this prevents different Ops that
# generate identical C code from being merged; presumably that
# never happens...
behavior = ("// Op class " + node.op.__class__.__name__ + "\n" +
behavior)
try:
cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub)
......
......@@ -218,6 +218,7 @@ if __name__ == "__main__":
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 680 0.11s 0.12s 0.154s 0.218s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
......
......@@ -15,12 +15,14 @@ try:
def _is_sparse(a):
return scipy.sparse.issparse(a)
except ImportError:
#scipy not imported, their can be only ndarray and cudandarray
# scipy not imported, their can be only ndarray and cudandarray
def _is_sparse(a):
return False
from theano.sandbox import cuda
if cuda.cuda_available:
from theano.sandbox.cuda.type import CudaNdarrayType
def _is_cuda(a):
return isinstance(a, cuda.CudaNdarray)
else:
......@@ -40,13 +42,19 @@ else:
def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, numpy.ndarray)
b_ndarray = isinstance(b, numpy.ndarray)
a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b)
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
a_cuda = _is_cuda(a)
b_cuda = _is_cuda(b)
if a_cuda and b_cuda:
return CudaNdarrayType.may_share_memory(a, b)
a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b)
if a_gpua and b_gpua:
return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b)
if (not(a_ndarray or a_sparse or a_cuda or a_gpua) or
not(b_ndarray or b_sparse or b_cuda or b_gpua)):
if raise_other_type:
......@@ -54,13 +62,6 @@ def may_share_memory(a, b, raise_other_type=True):
" and scipy.sparse, CudaNdarray or GpuArray type")
return False
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
if a_cuda and b_cuda:
from theano.sandbox.cuda.type import CudaNdarrayType
return CudaNdarrayType.may_share_memory(a, b)
if a_gpua and b_gpua:
return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
if a_cuda or b_cuda or a_gpua or b_gpua:
return False
return SparseType.may_share_memory(a, b)
# This is work in progress
from theano import Op, Apply
from theano import Op, Apply, tensor
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
......@@ -7,7 +7,8 @@ from theano.sandbox.neighbours import Images2Neibs
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.basic_ops import (
as_cuda_ndarray_variable, host_from_gpu, gpu_from_host)
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
......@@ -21,13 +22,16 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step):
assert ten4.dtype == 'float32'
if not isinstance(ten4.type, CudaNdarrayType):
raise TypeError('ten4 must be cudandarray', ten4)
ten4 = as_cuda_ndarray_variable(ten4)
neib_shape = tensor.as_tensor_variable(neib_shape)
neib_step = tensor.as_tensor_variable(neib_step)
assert ten4.ndim == 4
assert ten4.dtype == 'float32'
assert neib_shape.ndim == 1
assert neib_step.ndim == 1
assert "int" in neib_shape.dtype
assert "int" in neib_step.dtype
return Apply(self, [ten4, neib_shape, neib_step],
[CudaNdarrayType(broadcastable=(False, False),
......
......@@ -29,6 +29,9 @@ class GpuImages2Neibs(Images2Neibs, Op):
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step):
ten4 = as_gpuarray_variable(ten4)
neib_shape = T.as_tensor_variable(neib_shape)
neib_step = T.as_tensor_variable(neib_step)
assert ten4.ndim == 4
assert neib_shape.ndim == 1
......@@ -36,10 +39,6 @@ class GpuImages2Neibs(Images2Neibs, Op):
assert "int" in neib_shape.dtype
assert "int" in neib_step.dtype
ten4 = as_gpuarray_variable(ten4)
neib_shape = T.as_tensor_variable(neib_shape)
neib_step = T.as_tensor_variable(neib_step)
return Apply(self, [ten4, neib_shape, neib_step],
[GpuArrayType(broadcastable=(False, False),
dtype=ten4.type.dtype)()])
......
......@@ -145,6 +145,13 @@ class Scalar(Type):
self.dtype = dtype
self.dtype_specs() # error checking
@staticmethod
def may_share_memory(a, b):
# This class represents a basic C type, represented in Python
# by a numpy scalar. Those are read-only, so from Python they
# can never share memory.
return False
def filter(self, data, strict=False, allow_downcast=None):
py_type = self.dtype_specs()[0]
if strict and not isinstance(data, py_type):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论