Merge branch 'master' of git://github.com/Theano/Theano into scan

d0cbf05e · Laurent Dinh · c8f2711a · 8cc9395f · d0cbf05e · d0cbf05e
--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -1438,8 +1438,12 @@ def get_gcc_shared_library_arg():
 def std_include_dirs():
-    return (numpy.distutils.misc_util.get_numpy_include_dirs()
+    numpy_inc_dirs = numpy.distutils.misc_util.get_numpy_include_dirs()
-            + [distutils.sysconfig.get_python_inc()])
+    py_inc = distutils.sysconfig.get_python_inc()
+    py_plat_spec_inc = distutils.sysconfig.get_python_inc(plat_specific=True)
+    python_inc_dirs = ([py_inc] if py_inc == py_plat_spec_inc
+                       else [py_inc, py_plat_spec_inc])
+    return numpy_inc_dirs + python_inc_dirs
 def std_lib_dirs_and_libs():
@@ -1713,7 +1717,7 @@ class GCC_compiler(object):
                                    continue
                                mj, mn, patch = [int(vp) for vp in version]
                                if (((mj, mn) == (4, 6) and patch < 4) or
-                                        ((mj, mn) == (4, 7) and patch < 3) or
+                                        ((mj, mn) == (4, 7) and patch <= 3) or
                                        ((mj, mn) == (4, 8) and patch < 1)):
                                    new_flags[i] = p.rstrip('-avx')

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
+import operator
 import sys
 import numpy
@@ -213,20 +214,29 @@ def test_huge_elemwise_fusion():
    """
    shape = (2, 3, 4, 5, 6)
    ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape))
-    vars = [tensor.tanh(ttype) for x in range(7)]
+    gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()['gpu_ptr_size']
-    f = pfunc(vars, [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] -
+    if gpu_ptr_size == 8:
-                     vars[5] - vars[6]], mode=mode_with_gpu)
+        nb_in = 7
+        len_topo = 10
+    elif gpu_ptr_size == 4:
+        nb_in = 8
+        len_topo = 11
+    else:
+        raise Exception("Unexpected value for gpu_ptr_size", gpu_ptr_size)
+    vars = [tensor.tanh(ttype) for x in range(nb_in)]
+    f = pfunc(vars, [reduce(operator.sub, vars)], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    #theano.printing.debugprint(f)
    #for i, node in enumerate(topo):
    #    print >> sys.stdout, i, node
-    assert len(topo) == 10
+    assert len(topo) == len_topo
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
-    assert isinstance(topo[7].op.scalar_op, theano.scalar.basic.Sub)
+    assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
-    assert isinstance(topo[8].op.scalar_op, theano.scalar.basic.Composite)
+    assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
    #let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
-    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())
+    f(*[gen() for i in range(nb_in)])
    # Test the case where we can't put the computation on the gpu! their is too
    # many dimensions to the input to have 2 inputs to the op!

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -909,7 +909,22 @@ class UnaryScalarOp(ScalarOp):
            node.inputs[0].type != node.outputs[0].type):
            raise theano.gof.utils.MethodNotDefined()
-        dtype = node.inputs[0].dtype
+        dtype = node.inputs[0].type.dtype_specs()[1]
+        fct_call = self.c_code_contiguous_raw(dtype, 'n', 'x', 'z')
+        return """
+{
+        npy_intp n = PyArray_SIZE(%(z)s);
+        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
+        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
+        %(fct_call)s;
+}
+        """ % locals()
+    def c_code_contiguous_raw(self, dtype, n, i, o):
+        if not config.lib.amdlibm:
+            raise theano.gof.utils.MethodNotDefined()
+        if dtype.startswith('npy_'):
+            dtype = dtype[4:]
        if dtype == 'float32' and self.amd_float32 is not None:
            dtype = 'float'
            fct = self.amd_float32
@@ -918,12 +933,7 @@ class UnaryScalarOp(ScalarOp):
            fct = self.amd_float64
        else:
            raise theano.gof.utils.MethodNotDefined()
-        return """
+        return "%(fct)s(%(n)s, %(i)s, %(o)s)" % locals()
-        npy_intp n = PyArray_SIZE(%(z)s);
-        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
-        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
-        %(fct)s(n, x, z);
-        """ % locals()
 class BinaryScalarOp(ScalarOp):

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -173,12 +173,9 @@ SOMEPATH/Canopy_64bit/User/lib/python2.7/site-packages/numpy/distutils/system_in
  warnings.warn('Specified path %s is invalid.' % d)
 """
            #I'm not able to remove all printed stuff
-            with_context = warnings.catch_warnings(record=True)
+            with warnings.catch_warnings(record=True):
-            with_context.__enter__()
+                numpy.distutils.system_info.system_info.verbosity = 0
-            try:
                blas_info = numpy.distutils.system_info.get_info("blas_opt")
-            finally:
-                with_context.__exit__(None, None, None)
        # If we are in a EPD installation, mkl is available
        if "EPD" in sys.version:

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -95,7 +95,7 @@ class SoftmaxWithBias(gof.Op):
        return ['<iostream>', '<cmath>']
    @staticmethod
-    def c_code_template():
+    def c_code_template(dtype):
        # this implementation was lifted from
        # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
@@ -107,6 +107,10 @@ class SoftmaxWithBias(gof.Op):
        #TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
        init_decl = """
        npy_intp* Nx = PyArray_DIMS(%(x)s);
+        npy_intp Sx = 0;
+        npy_intp Sb = 0;
+        npy_intp Ssm = 0;
        if (PyArray_NDIM(%(x)s) != 2)
        {
@@ -151,6 +155,10 @@ class SoftmaxWithBias(gof.Op):
                %(fail)s
            }
        }
+        Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
+        Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s);
+        Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
        """
        begin_row_loop = """
@@ -163,9 +171,7 @@ class SoftmaxWithBias(gof.Op):
            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
            const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s));
            dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
-        """
-        inside_row_loop = """
            npy_intp Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
            npy_intp Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s);
            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
@@ -182,6 +188,9 @@ class SoftmaxWithBias(gof.Op):
                row_max   = (row_ij > row_max) ? row_ij : row_max;
            }
+        """
+        inside_row_loop = """
            for (j = 0; j < Nx[1]; ++j)
            {
                dtype_%(sm)s row_ij = x_i[j * Sx] +  b_i[j * Sb];
@@ -201,6 +210,42 @@ class SoftmaxWithBias(gof.Op):
        """
+        # Get the vectorized version of exp if it exists
+        try:
+            vec_exp = theano.scalar.exp.c_code_contiguous_raw(dtype,
+                                                              "Nx[1]", "sm_i", "sm_i")
+            inside_row_loop_contig = """
+            for (j = 0; j < Nx[1]; ++j)
+            {
+                dtype_%%(sm)s row_ij = x_i[j * Sx] +  b_i[j * Sb];
+                //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n";
+                dtype_%%(sm)s sm_ij = row_ij - row_max;
+                //std::cout << "3 " << j << " " << sm_ij << "\\n";
+                sm_i[j * Ssm] = sm_ij;
+            }
+            %(vec_exp)s;
+            for (j = 0; j < Nx[1]; ++j)
+            {
+                sum += sm_i[j * Ssm];
+            }
+            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
+            double sum_inv = 1.0 / sum;
+            for (j = 0; j < Nx[1]; ++j)
+            {
+                sm_i[j * Ssm] *= sum_inv;
+            }
+        """ % locals()
+            inside_row_loop = """
+            if(Ssm == 1){
+                %(inside_row_loop_contig)s
+            }else{
+                %(inside_row_loop)s
+            }
+            """ % locals()
+        except theano.gof.utils.MethodNotDefined:
+            pass
        end_row_loop = """
        }
        """
@@ -210,12 +255,13 @@ class SoftmaxWithBias(gof.Op):
    def c_code(self, node, name, inp, out, sub):
        x, b = inp
        sm, = out
-        code_template = ''.join(self.c_code_template())
+        code_template = ''.join(self.c_code_template(
+            node.inputs[0].type.dtype_specs()[1]))
        return code_template % dict(locals(), **sub)
    @staticmethod
    def c_code_cache_version():
-        return (6,)
+        return (8,)
 softmax_with_bias = SoftmaxWithBias()
@@ -384,7 +430,7 @@ class Softmax(gof.Op):
        return ['<iostream>', '<cmath>']
    @staticmethod
-    def c_code_template():
+    def c_code_template(dtype):
        # this implementation was lifted from
        # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
@@ -396,6 +442,8 @@ class Softmax(gof.Op):
        #TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
        init_decl = """
        npy_intp* Nx = PyArray_DIMS(%(x)s);
+        npy_intp Sx1 = 0;
+        npy_intp Ssm1 = 0;
        if (PyArray_NDIM(%(x)s) != 2)
        {
@@ -413,7 +461,7 @@ class Softmax(gof.Op):
            || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0])
            || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1]))
        {
-            if (NULL != %(sm)s) Py_XDECREF(%(sm)s);
+            Py_XDECREF(%(sm)s);
            %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s),
                                                       type_num_%(x)s);
            if(!%(sm)s) {
@@ -422,6 +470,8 @@ class Softmax(gof.Op):
                %(fail)s
            }
        }
+        Sx1 = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
+        Ssm1 = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
        """
        begin_row_loop = """
@@ -433,11 +483,6 @@ class Softmax(gof.Op):
            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
            dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
-        """
-        inside_row_loop = """
-            npy_intp Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
-            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
            size_t row_max_j=0;
            dtype_%(sm)s row_max = x_i[0];
@@ -445,46 +490,82 @@ class Softmax(gof.Op):
            // Get the maximum value of the row
            for (j = 1; j < Nx[1]; ++j)
            {
-                dtype_%(sm)s row_ij = x_i[j * Sx] ;
+                dtype_%(sm)s row_ij = x_i[j * Sx1] ;
                //std::cout << "1 " << row_ij << "\\n";
                row_max_j = (row_ij > row_max) ? j : row_max_j;
                row_max   = (row_ij > row_max) ? row_ij : row_max;
            }
+        """
+        inside_row_loop = """
            for (j = 0; j < Nx[1]; ++j)
            {
-                dtype_%(sm)s row_ij = x_i[j * Sx] ;
+                dtype_%(sm)s row_ij = x_i[j * Sx1] ;
                //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n";
                dtype_%(sm)s sm_ij = exp(row_ij - row_max);
                //std::cout << "3 " << j << " " << sm_ij << "\\n";
                sum += sm_ij;
-                sm_i[j * Ssm] = sm_ij;
+                sm_i[j * Ssm1] = sm_ij;
            }
            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
            double sum_inv = 1.0 / sum;
            for (j = 0; j < Nx[1]; ++j)
            {
-                sm_i[j * Ssm] *= sum_inv;
+                sm_i[j * Ssm1] *= sum_inv;
            }
        """
+        # Get the vectorized version of exp if it exists
+        try:
+            vec_exp = theano.scalar.exp.c_code_contiguous_raw(dtype,
+                                                              "Nx[1]", "sm_i", "sm_i")
+            inside_row_loop_contig = """
+            for (j = 0; j < Nx[1]; ++j)
+            {
+                sm_i[j * Ssm1] = x_i[j * Sx1] - row_max;
+            }
+            %(vec_exp)s;
+            for (j = 0; j < Nx[1]; ++j)
+            {
+                sum += sm_i[j * Ssm1];
+            }
+            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
+            double sum_inv = 1.0 / sum;
+            for (j = 0; j < Nx[1]; ++j)
+            {
+                sm_i[j * Ssm1] *= sum_inv;
+            }
+            """ % locals()
+            inside_row_loop = """
+            if(Ssm1 == 1){
+                %(inside_row_loop_contig)s
+            }else{
+                %(inside_row_loop)s
+            }
+            """ % locals()
+        except theano.gof.utils.MethodNotDefined:
+            pass
        end_row_loop = """
        }
        """
        return (init_decl, begin_row_loop, inside_row_loop, end_row_loop)
    def c_code(self, node, name, inp, out, sub):
        x, = inp
        sm, = out
-        code_template = ''.join(self.c_code_template())
+        code_template = ''.join(self.c_code_template(
+            node.inputs[0].type.dtype_specs()[1]))
        return code_template % dict(locals(), **sub)
    @staticmethod
    def c_code_cache_version():
-        return (1,)
+        return (3,)
 softmax = Softmax()
@@ -863,7 +944,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
        return ['<iostream>', '<cmath>']
    @staticmethod
-    def c_code_template():
+    def c_code_template(dtype):
        # this implementation was lifted from
        # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
@@ -874,7 +955,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
        #TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
        (init_decl, begin_row_loop, inside_row_loop, end_row_loop) = \
-                SoftmaxWithBias.c_code_template()
+                SoftmaxWithBias.c_code_template(dtype)
        return (init_decl,
                """
        if (PyArray_NDIM(%(y_idx)s) != 1)
@@ -947,7 +1028,8 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
        nll, sm, am = out
        y_idx_type = node.inputs[2].type.dtype_specs()[1]
        am_type = y_idx_type
-        code_template = ''.join(self.c_code_template())
+        dtype = node.inputs[0].type.dtype_specs()[1]
+        code_template = ''.join(self.c_code_template(dtype))
        return code_template % dict(locals(), **sub)

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -587,7 +587,7 @@ class MakeVector(T.Op):
            out[0][...] = inputs
    def c_code_cache_version(self):
-        return (1,)
+        return (2,)
    def c_code(self, node, name, inp, out_, sub):
        out, = out_
@@ -604,7 +604,7 @@ class MakeVector(T.Op):
        ret = """
        npy_intp dims[1];
        dims[0] = %(out_shape)s;
-        if(!%(out)s || PyArray_DIMS(%(out)s)[0] == %(out_shape)s){
+        if(!%(out)s || PyArray_DIMS(%(out)s)[0] != %(out_shape)s){
            Py_XDECREF(%(out)s);
            %(out)s = (PyArrayObject*)PyArray_EMPTY(1, dims, %(out_dtype)s, 0);
        }

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -6736,6 +6736,17 @@ class TestTensorInstanceMethods(unittest.TestCase):
        # Test equivalent advanced indexing
        assert_array_equal(X[:,indices].eval({X: x}), x[:,indices])
+    def test_cumsum(self):
+        X, _ = self.vars
+        x, _ = self.vals
+        assert_array_equal(X.cumsum().eval({X: x}), x.cumsum())
+    def test_cumprod(self):
+        X, _ = self.vars
+        x, _ = self.vals
+        assert_array_equal(X.cumprod().eval({X: x}), x.cumprod())
 def test_norm():
    x = theano.tensor.vector('x')
    n = x.norm(2)

--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
@@ -11,6 +11,7 @@ from theano.tensor.utils import hash_from_ndarray
 from theano.tensor.type import TensorType
 class AsTensorError(TypeError):
    """Raised when as_tensor_variable isn't able to create a
    TensorVariable.
@@ -509,13 +510,11 @@ class _tensor_py_operators:
    def sort(self, axis=-1, kind='quicksort', order=None):
        """See `theano.tensor.sort`"""
-        from theano.tensor.sort import sort
+        return theano.tensor.sort(self, axis, kind, order)
-        return sort(self, axis, kind, order)
    def argsort(self, axis=-1, kind='quicksort', order=None):
        """See `theano.tensor.argsort`"""
-        from theano.tensor.sort import argsort
+        return theano.tensor.argsort(self, axis, kind, order)
-        return argsort(self, axis, kind, order)
    def clip(self, a_min, a_max):
        "Clip (limit) the values in an array."
@@ -529,16 +528,14 @@ class _tensor_py_operators:
    def repeat(self, repeats, axis=None):
        """See `theano.tensor.repeat`"""
-        from theano.tensor.extra_ops import repeat
+        return theano.tensor.extra_ops.repeat(self, repeats, axis)
-        return repeat(self, repeats, axis)
    def round(self, mode="half_away_from_zero"):
        """See `theano.tensor.round`"""
        return theano.tensor.basic.round(self, mode)
    def trace(self):
-        from theano.sandbox.linalg import trace
+        return theano.sandbox.linalg.trace(self)
-        return trace(self)
    # TO TRUMP NUMPY OPERATORS
    __array_priority__ = 1000
@@ -549,6 +546,12 @@ class _tensor_py_operators:
    def zeros_like(model, dtype=None):
        return theano.tensor.basic.zeros_like(model, dtype=dtype)
+    def cumsum(self, axis=None):
+        return theano.tensor.extra_ops.cumsum(self, axis)
+    def cumprod(self, axis=None):
+        return theano.tensor.extra_ops.cumprod(self, axis)
 class TensorVariable(_tensor_py_operators, Variable):
    """Subclass to add the tensor operators to the basic `Variable` class."""