提交 c7ca08bf authored 作者: carriepl's avatar carriepl

Merge pull request #2753 from mducoffe/ccw_2692

AllocEmpty
......@@ -26,7 +26,7 @@ from theano.sandbox.cuda.basic_ops import (
GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten,
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
......@@ -570,6 +570,8 @@ def local_gpu_dot22(node):
@local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
def local_gpu_dot22scalar(node):
"""
Deprecated: _dot22scalar has been replaced by gemm.
See Dot22scalar for more details.
gpu_from_host(dot22scalar) -> gpudot(gpu_from_host)
dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
......@@ -2290,6 +2292,15 @@ def gpuScanOptimization(node):
return False
@register_opt()
@local_optimizer([tensor.AllocEmpty, gpu_from_host])
def local_gpu_allocempty(node):
    """Move a float32 ``AllocEmpty`` to the GPU.

    AllocEmpty(float32) -> host_from_gpu(GpuAllocEmpty(...))

    Only float32 allocations are transferred; any other dtype (or any
    other op) is left untouched.
    """
    op = node.op
    if not isinstance(op, tensor.AllocEmpty):
        return False
    if op.dtype != "float32":
        return False
    gpu_out = GpuAllocEmpty()(*node.inputs)
    return [host_from_gpu(gpu_out)]
optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
gpu_flag=True),
......
......@@ -382,7 +382,7 @@ def test_alloc_empty():
assert out.shape == (2, 3)
assert out.dtype == 'float32'
# Test that we do not merge them.
# Test that we merge them.
f = theano.function([], [cuda.basic_ops.gpu_alloc_empty(2, 3),
cuda.basic_ops.gpu_alloc_empty(2, 3)])
out = f()
......
......@@ -154,6 +154,18 @@ def test_gpualloc():
assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
def test_gpuallocempty():
    """AllocEmpty(float32) must be lifted to the GPU; int32 must not."""
    # float32 graph compiled in GPU mode: expect a GpuAllocEmpty node.
    f_gpu = theano.function([], tensor.AllocEmpty('float32')(2, 3),
                            mode=mode_with_gpu)
    gpu_topo = f_gpu.maker.fgraph.toposort()
    assert numpy.any([isinstance(apply.op, basic_ops.GpuAllocEmpty)
                      for apply in gpu_topo])

    # int32 graph compiled on the CPU: the GPU op must not appear.
    f_cpu = theano.function([], tensor.AllocEmpty('int32')(2, 3))
    cpu_topo = f_cpu.maker.fgraph.toposort()
    assert not numpy.any([isinstance(apply.op, basic_ops.GpuAllocEmpty)
                          for apply in cpu_topo])
class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
    # Re-run the CPU elemwise/alloc optimization tests with float32,
    # the dtype used on the GPU.
    # NOTE(review): assumes the base class reads ``dtype`` when building
    # its test graphs -- confirm against test_opt.
    dtype = 'float32'
......
......@@ -17,7 +17,7 @@ from theano.tensor import elemwise
from theano.tensor.var import (AsTensorError, TensorVariable,
TensorConstant,
_tensor_py_operators)
from theano.tensor.type import TensorType
from theano.tensor.type import TensorType, values_eq_approx_always_true
from theano.tensor.type_other import NoneConst
from theano import scalar as scal
from theano.compat import partial
......@@ -592,7 +592,8 @@ def get_scalar_constant_value(orig_v, elemwise=True,
continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant):
return numpy.asarray(v.owner.inputs[0].data.shape[v.owner.op.i])
return numpy.asarray(
v.owner.inputs[0].data.shape[v.owner.op.i])
# Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow.
......@@ -5467,3 +5468,84 @@ class Choose(Op):
choice = inputs[1]
# TODO reuse out?
z[0] = numpy.choose(a, choice, mode=self.mode)
class AllocEmpty(gof.Op):
    """Implement Alloc on the cpu, but without initializing memory.

    The output buffer contains arbitrary (uninitialized) values; callers
    must fully overwrite it before reading.  The op is parameterized only
    by the output ``dtype``; the shape comes from the inputs passed to
    ``make_node``.
    """

    # dtype is the only property distinguishing two instances.
    __props__ = ("dtype",)

    def __init__(self, dtype):
        # specify the type of the data
        assert isinstance(dtype, str)
        self.dtype = dtype.lower()

    def validate_shape(self, shape):
        """Check the shape arguments and build the output variable.

        :param shape: iterable of integer scalar variables/values.
        :returns: ``(shape_vars, output_var)``.
        :raises TypeError: if a shape argument is not an integer.
        """
        sh = [as_tensor_variable(s) for s in shape]
        bcast = []
        for s in sh:
            if s.type.dtype[:3] not in ('int', 'uin'):
                raise TypeError('Shape arguments must be integers', s)
            # if s is constant 1, then we're broadcastable in that dim
            try:
                const_shp = get_scalar_constant_value(s)
            except NotScalarConstantError:
                const_shp = None
            bcast.append(numpy.all(1 == const_shp))
        otype = TensorType(dtype=self.dtype, broadcastable=bcast)
        output = otype()
        return sh, output

    def make_node(self, *shape):
        shape, output = self.validate_shape(shape)
        # The content is uninitialized, so any two outputs must compare
        # as "equal" for the debugging/comparison machinery.
        output.tag.values_eq_approx = values_eq_approx_always_true
        return Apply(self, shape, [output])

    def perform(self, node, inputs, out_):
        out, = out_
        sh = tuple(int(i) for i in inputs)
        # Reuse the previous output buffer when the shape is unchanged.
        if out[0] is None or out[0].shape != sh:
            out[0] = numpy.empty(sh, dtype=self.dtype)

    def c_code(self, node, name, inputs, out_, sub):
        dtype = "NPY_" + self.dtype.upper()
        out, = out_
        fail = sub['fail']
        shps = inputs
        nd = len(shps)
        # Accumulate the generated C source.  (Renamed from ``str``,
        # which shadowed the builtin.)
        code = "npy_intp dims[%(nd)s];\n" % locals()
        for idx, sh in enumerate(shps):
            code += "dims[%(idx)s] =" \
                    "((npy_intp)((dtype_%(sh)s*)" \
                    " PyArray_DATA(%(sh)s))[0]);\n" % locals()
        # Validate that the output storage exists
        code += "if(%(out)s==NULL\n" % locals()
        for idx, sh in enumerate(shps):
            code += "||PyArray_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]" % locals()
        code += """){
            /* Reference received to invalid output variable.
               Decrease received reference's ref count and allocate new
               output variable */
            Py_XDECREF(%(out)s);
            %(out)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s,
                                                    dims,
                                                    %(dtype)s,
                                                    0);
            if (!%(out)s)
            {
                PyErr_SetString(PyExc_MemoryError, "alloc failed");
                %(fail)s;
            }
        }
        """ % locals()
        return code

    def infer_shape(self, node, input_shapes):
        # The inputs ARE the output shape.
        return [node.inputs]

    def c_code_cache_version(self):
        return (3,)

    def do_constant_folding(self, node):
        # Folding would allocate (uninitialized) storage at compile time
        # for no benefit.
        return False
......@@ -1824,7 +1824,6 @@ def local_dot22_to_ger_or_gemv(node):
# x and y are both vectors so this might qualifies for a GER
xv = x.dimshuffle(0)
yv = y.dimshuffle(1)
zeros = T.zeros([x.shape[0], y.shape[1]], dtype=x.dtype)
rval = ger(zeros, one, xv, yv)
return [rval]
......@@ -1832,19 +1831,19 @@ def local_dot22_to_ger_or_gemv(node):
# x and y are both vectors so this qualifies for a sdot / ddot
# TODO: Theano doesn't have a sdot, but gemv is better than _dot22
xv = x.dimshuffle(1)
zeros = T.zeros([1], x.dtype)
zeros = T.AllocEmpty(x.dtype)(1)
rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
return [rval.dimshuffle('x', 0)]
if xb[0] and not yb[0] and not yb[1]:
# x is vector, y is matrix so try gemv
xv = x.dimshuffle(1)
zeros = T.zeros([y.shape[1]], x.dtype)
zeros = T.AllocEmpty(x.dtype)(y.shape[1])
rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
return [rval.dimshuffle('x', 0)]
if not xb[0] and not xb[1] and yb[1]:
# x is matrix, y is vector, try gemv
yv = y.dimshuffle(0)
zeros = T.zeros([x.shape[0]], dtype=x.dtype)
zeros = T.AllocEmpty(x.dtype)(x.shape[0])
rval = gemv_no_inplace(zeros, one, x, yv, zero)
return [rval.dimshuffle(0, 'x')]
......@@ -2043,8 +2042,12 @@ def local_dot22_to_dot22scalar(node):
a = T.cast(_as_scalar(m.owner.inputs[scalar_idx],
dtype=d.dtype), d.type.dtype)
assert not a.type.ndim
dot = _dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a)
z = T.AllocEmpty(d.owner.inputs[0].dtype)(d.owner.inputs[0].shape[0],
d.owner.inputs[1].shape[1])
zero = T.as_tensor_variable(numpy.asarray(0, dtype=a.dtype))
dot = gemm(z, a, d.owner.inputs[0], d.owner.inputs[1], zero)
# The other inputs to the original node that were
# neither part of the dot22 or this mul should be
# factors in the returned "mul" node.
......@@ -2079,11 +2082,16 @@ def local_dot22_to_dot22scalar(node):
a = T.cast(i_scalar[scalar_idx], d.type.dtype)
assert not a.type.ndim
if len(o) == 0:
return [_dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a)]
z = T.AllocEmpty(d.owner.inputs[0].dtype)(d.owner.inputs[0].shape[0],
d.owner.inputs[1].shape[1])
zero = T.as_tensor_variable(numpy.asarray(0, dtype=a.dtype))
return [gemm(z, a, d.owner.inputs[0], d.owner.inputs[1], zero)]
else:
return [T.mul(_dot22scalar(d.owner.inputs[0],
d.owner.inputs[1], a), *o)]
z = T.AllocEmpty(d.owner.inputs[0].dtype)(d.owner.inputs[0].shape[0],
d.owner.inputs[1].shape[1])
zero = T.as_tensor_variable(numpy.asarray(0, dtype=a.dtype))
return [T.mul(gemm(z, a, d.owner.inputs[0], d.owner.inputs[1],
zero), *o)]
# Must happen after gemm, as the gemm optimizer doesn't understand
# dot22scalar, and gemm gives more speed-up than dot22scalar.
blas_optdb.register('local_dot22_to_dot22scalar',
......
......@@ -47,7 +47,7 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
itensor3, Tile, switch, Diagonal, Diag,
nonzero, flatnonzero, nonzero_values,
stacklists, DimShuffle, hessian, ptp, power,
swapaxes, choose, Choose, NoneConst,
swapaxes, choose, Choose, NoneConst, AllocEmpty
)
from theano.tests import unittest_tools as utt
......@@ -7558,6 +7558,15 @@ class T_Choose(utt.InferShapeTester):
# Op that should be removed from the graph.
self.op_class)
def test_allocempty():
    """AllocEmpty compiles to a single node and yields the right array."""
    fn = theano.function([], AllocEmpty("float32")(2, 3))
    # The whole graph should be exactly one apply node.
    assert len(fn.maker.fgraph.apply_nodes) == 1
    result = fn()
    assert result.shape == (2, 3)
    assert result.dtype == 'float32'
"""
if __name__ == '__main__':
......
......@@ -875,28 +875,32 @@ def test_dot22scalar():
cst = theano.tensor.basic.constant(.2, dtype=dtype4)
cst2 = theano.tensor.basic.constant(.1, dtype=dtype4)
def check_dot22scalar(func, len_topo_scalar=-1):
def check_dot22scalar_gemm(func, len_topo_scalar=-1):
topo = func.maker.fgraph.toposort()
ops = [x.op for x in topo]
classes = [type(x.op) for x in topo]
dtype4_upcast = theano.scalar.upcast(dtype4, dtype1,
dtype2)
if dtype1 == dtype2 == dtype3 == dtype4_upcast:
if len_topo_scalar > 0:
assert len(topo) == len_topo_scalar
assert _dot22scalar in ops, (dtype1, dtype2,
assert gemm_inplace in ops, (dtype1, dtype2,
dtype3, dtype4)
elif dtype1 == dtype2 == dtype4_upcast:
if not (len_topo_scalar > 0):
assert len(topo) == len_topo_scalar
assert _dot22scalar in ops, (dtype1, dtype2,
assert gemm_inplace in ops, (dtype1, dtype2,
dtype3, dtype4)
assert not T.Elemwise in classes, (
dtype1, dtype2, dtype3, dtype4)
else:
# Currently there is a problem of
# optimization order The constant get
# upcasted to float64 before we try to
# merge it with the dot22 of
# float32. So this prevent the merge.
assert _dot22scalar in ops or _dot22 in ops, (
assert gemm_inplace in ops or _dot22 in ops, (
dtype1, dtype2, dtype3, dtype4)
elif dtype1 == dtype2:
......@@ -916,7 +920,7 @@ def test_dot22scalar():
f = theano.function([a, b], cst * T.dot(a, b),
mode=mode_blas_opt)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 1)
check_dot22scalar_gemm(f, 1)
f(av, bv)
......@@ -925,7 +929,8 @@ def test_dot22scalar():
cst * c * T.dot(a, b),
mode=mode_blas_opt)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
#print (av.dtype, bv.dtype, cv.dtype)
f(av, bv, cv)
......@@ -933,7 +938,7 @@ def test_dot22scalar():
c * cst * T.dot(a, b),
mode=mode_blas_opt)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
f(av, bv, cv)
# Here, canonicalize also seems needed
......@@ -943,7 +948,7 @@ def test_dot22scalar():
cst2 * c * cst * T.dot(a, b),
mode=m2)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
f(av, bv, cv)
if dtype1 == dtype2 == dtype3:
......@@ -951,7 +956,7 @@ def test_dot22scalar():
c * cst * a * T.dot(a, b),
mode=m2)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
f(sv, sv, sv)
f = theano.function([a, b, c],
......@@ -974,7 +979,7 @@ def test_dot22scalar():
c * a * cst * T.dot(a, b),
mode=m2)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
f(sv, sv, sv)
cmp((3, 4), (4, 5), (3, 5))
......@@ -994,7 +999,7 @@ def test_dot22scalar_cast():
for scalar_int_type in T.int_dtypes:
y = T.scalar(dtype=scalar_int_type)
f = theano.function([A, y], T.dot(A, A) * y, mode=mode_blas_opt)
assert _dot22scalar in [x.op for x in f.maker.fgraph.toposort()]
assert gemm_inplace in [x.op for x in f.maker.fgraph.toposort()]
A = T.fmatrix()
for scalar_int_type in T.int_dtypes:
y = T.scalar(dtype=scalar_int_type)
......@@ -1002,7 +1007,7 @@ def test_dot22scalar_cast():
if scalar_int_type in ['int32', 'int64']:
assert _dot22 in [x.op for x in f.maker.fgraph.toposort()]
else:
assert _dot22scalar in [x.op for x in f.maker.fgraph.toposort()]
assert gemm_inplace in [x.op for x in f.maker.fgraph.toposort()]
def test_local_dot22_to_dot22scalar():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论