提交 38e2f502 authored 作者: abergeron's avatar abergeron

Merge pull request #1946 from nouiz/scan

Small scan speed up on the GPU.
......@@ -21,6 +21,8 @@ Montreal).
News
====
* Colin Raffel `tutorial on Theano <http://nbviewer.ipython.org/github/craffel/theano-tutorial/blob/master/Theano%20Tutorial.ipynb>`_.
* Ian Goodfellow did a `12h class with exercises on Theano <https://github.com/goodfeli/theano_exercises>`_.
* Theano 0.6 was released. Everybody is encouraged to update.
......
......@@ -120,8 +120,18 @@ enum = EnumStr("g++", "")
try:
rc = call_subprocess_Popen(['g++', '-v'])
except OSError:
enum = EnumStr("")
rc = 1
if rc == 0:
AddConfigVar('cxx',
"The C++ compiler to use. Currently only g++ is"
" supported, but supporting additional compilers should not be "
"too difficult. "
"If it is empty, no C++ code is compiled.",
enum,
in_c_key=False)
del enum
if rc == 0 and config.cxx != "":
# Keep the default linker the same as the one for the mode FAST_RUN
AddConfigVar('linker',
("Default linker used if the theano flags mode is Mode "
......@@ -140,16 +150,6 @@ else:
'optimized C-implementations (for both CPU and GPU) and will '
'default to Python implementations. Performance will be severely '
'degraded.')
enum = EnumStr("")
AddConfigVar('cxx',
"The C++ compiler to use. Currently only g++ is"
" supported, but supporting additional compilers should not be "
"too difficult. "
"If it is empty, no C++ code is compiled.",
enum,
in_c_key=False)
del enum
#Keep the default value the same as the one for the mode FAST_RUN
......
......@@ -12,7 +12,8 @@ import warnings
from theano.gof.python25 import all
from theano.configparser import config, AddConfigVar, BoolParam, ConfigParam
from theano.configparser import (config, AddConfigVar,
BoolParam, ConfigParam, _config_var_list)
import theano.gof.cmodule
......@@ -560,7 +561,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e:
# already changed the default linker to something else then CVM.
# Currently this is the py linker.
# Here we assert that the default linker is not cvm.
assert not [x for x in theano.configparser._config_var_list
assert not [x for x in _config_var_list
if x.fullname == 'linker'][0].default.startswith('cvm'), e
pass
......
......@@ -1411,9 +1411,10 @@ def norm(x,ord):
elif ndim > 2:
raise NotImplementedError("We don't support norm witn ndim > 2")
class lstsq(theano.Op):
def __eq__(self, other):
pass
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
......
......@@ -422,11 +422,19 @@ class Scan(PureOp):
raise ValueError('For output %s you need to provide a '
'scalar int !', str(outer_nitsot))
assert len(new_inputs) == len(inputs)
self.vector_seqs = [seq.ndim == 1 for seq in
new_inputs[1:1 + self.n_seqs]]
self.vector_outs = [arg.ndim == 1 for arg in
new_inputs[1 + self.n_seqs: (1 + self.n_seqs +
self.n_outs)]]
# The vector_seqs and vector_outs are just a workaround
# strange NumPy behavior: vector_ndarray[int] return a NumPy
# scalar and not a NumPy ndarray of 0 dimensions.
self.vector_seqs = [isinstance(seq, (tensor.TensorVariable,
tensor.TensorConstant)) and
seq.ndim == 1 for seq in
new_inputs[1:1 + self.n_seqs]]
self.vector_outs = [isinstance(arg, (tensor.TensorVariable,
tensor.TensorConstant)) and
arg.ndim == 1 for arg in
new_inputs[1 + self.n_seqs: (1 + self.n_seqs +
self.n_outs)]]
self.vector_outs += [False] * self.n_nit_sot
apply_node = Apply(self,
......@@ -598,12 +606,6 @@ class Scan(PureOp):
for _d1 in range(cython_mit_mot_out_nslices[_d0]):
cython_mit_mot_out_slices[_d0, _d1] = \
self.mit_mot_out_slices[_d0][_d1]
vector_seqs = [seq.ndim == 1 for seq in
node.inputs[1:1 + self.n_seqs]]
vector_outs = [arg.ndim == 1 for arg in
node.inputs[1 + self.n_seqs:
(1 + self.n_seqs + self.n_outs)]]
vector_outs += [False] * self.n_nit_sot
cython_vector_seqs = numpy.asarray(self.vector_seqs,
dtype='int32')
......
import os, logging, sys
import logging
import os
import sys
import numpy
import theano
from theano import config
......@@ -60,6 +64,28 @@ except ImportError:
os.mkdir(loc)
preargs = ['-fwrapv', '-O2', '-fno-strict-aliasing']
preargs += cmodule.GCC_compiler.compile_args()
# Cython 19.1 always use the old NumPy interface. So we
# need to manually modify the .c file to get it compiled
# by Theano. As by default, we tell NumPy to don't import
# the old interface.
if False:
#During scan cython development, it is helpful to keep the old interface, to don't manually edit the c file each time.
preargs.remove('-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION')
else:
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
# Add add some macro to lower the number of edit
# needed to the c file.
if bool(numpy_ver >= [1, 7]):
# Needed when we disable the old API, as cython
# use the old interface
preargs.append("-D NPY_ENSUREARRAY=NPY_ARRAY_ENSUREARRAY")
preargs.append("-D NPY_ENSURECOPY=NPY_ARRAY_ENSURECOPY")
preargs.append("-D NPY_ALIGNED=NPY_ARRAY_ALIGNED")
preargs.append("-D NPY_WRITEABLE=NPY_ARRAY_WRITEABLE")
preargs.append("-D NPY_UPDATE_ALL=NPY_ARRAY_UPDATE_ALL")
preargs.append("-D NPY_C_CONTIGUOUS=NPY_ARRAY_C_CONTIGUOUS")
preargs.append("-D NPY_F_CONTIGUOUS=NPY_ARRAY_F_CONTIGUOUS")
cmodule.GCC_compiler.compile_str(dirname, code, location=loc,
preargs=preargs)
# Save version into the __init__.py file.
......
......@@ -16,7 +16,7 @@ import theano.sandbox.rng_mrg
from theano import tensor
from theano.compile.pfunc import rebuild_collect_shared
from theano.gof.python25 import any
from theano.tests import unittest_tools as utt
from theano.tests import unittest_tools as utt
import theano.scalar.sharedvar
from theano.gof.python25 import OrderedDict
from theano.compat import PY3
......@@ -46,7 +46,7 @@ mode_with_gpu = mode_with_opt.including('gpu', 'scan')
class multiple_outputs_numeric_grad:
"""WRITEME"""
type_eps = {'float64': 1e-7,
'float32': 3e-3}
'float32': 3e-3}
def __init__(self, f, pt, ndarray_mask=None, eps=None):
"""Return the gradient of f at pt.
......@@ -81,12 +81,12 @@ class multiple_outputs_numeric_grad:
if ndarray_mask[i]:
pt[i] = numpy.array(p)
_eps = multiple_outputs_numeric_grad.type_eps[str(
pt[i].dtype)]
pt[i].dtype)]
if _eps > dtype_eps:
dtype_eps = _eps
self.ndarray_mask = ndarray_mask
#'''
# '''
# Compute clean output:
f_x = f(*pt)
gx = []
......@@ -148,7 +148,7 @@ class multiple_outputs_numeric_grad:
return numpy.inf, 0
#TODO: Test this function, and if it works,
# TODO: Test this function, and if it works,
# use it with the normal verify_grad rather than the
# copy-and-pasted one above.
# Also - add a reference to this technique in the
......@@ -201,7 +201,6 @@ def grab_scan_node(output):
class T_Scan(unittest.TestCase):
#class T_Scan(object):
def setUp(self):
utt.seed_rng()
......@@ -230,7 +229,7 @@ class T_Scan(unittest.TestCase):
updates=updates,
allow_input_downcast=True)
### TESTING PICKLE-ing this function
# TESTING PICKLE-ing this function
origdir = os.getcwd()
tmpdir = None
try:
......@@ -367,7 +366,7 @@ class T_Scan(unittest.TestCase):
# This first version test the first case in the optimizer to the gpu.
def test_one_sequence_one_output_weights_gpu1(self):
from theano.sandbox import cuda
if cuda.cuda_available == False:
if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled')
def f_rnn(u_t, x_tm1, W_in, W):
......@@ -447,7 +446,7 @@ class T_Scan(unittest.TestCase):
# This second version test the second case in the optimizer to the gpu.
def test_one_sequence_one_output_weights_gpu2(self):
from theano.sandbox import cuda
if cuda.cuda_available == False:
if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled')
def f_rnn(u_t, x_tm1, W_in, W):
......@@ -511,7 +510,7 @@ class T_Scan(unittest.TestCase):
# outputs when is running on GPU
def test_gpu3_mixture_dtype_outputs(self):
from theano.sandbox import cuda
if cuda.cuda_available == False:
if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled')
def f_rnn(u_t, x_tm1, W_in, W):
......@@ -595,11 +594,11 @@ class T_Scan(unittest.TestCase):
v_out = numpy.zeros((4,))
v_out[0] = v_u[0] * W_in.get_value() + v_x0 * W.get_value()
for step in xrange(1, 4):
v_out[step] = v_u[step] * W_in.get_value() + \
v_out[step - 1] * W.get_value()
v_out[step] = (v_u[step] * W_in.get_value() +
v_out[step - 1] * W.get_value())
theano_values = f3(v_u, v_x0)
assert numpy.allclose(theano_values, v_out)
assert numpy.allclose(theano_values, v_out)
# some rnn with multiple outputs and multiple inputs; other
# dimension instead of scalars/vectors
......@@ -624,7 +623,7 @@ class T_Scan(unittest.TestCase):
y0 = theano.tensor.scalar('y0')
def f_rnn_cmpl(u1_t, u2_t, x_tm1, y_tm1, W_in1):
return [theano.dot(u1_t, W_in1) + u2_t * W_in2 + \
return [theano.dot(u1_t, W_in1) + u2_t * W_in2 +
theano.dot(x_tm1, W), theano.dot(x_tm1, W_out)]
outputs, updates = theano.scan(f_rnn_cmpl,
......@@ -643,12 +642,12 @@ class T_Scan(unittest.TestCase):
# compute the values in numpy
v_x = numpy.zeros((3, 2), dtype=theano.config.floatX)
v_y = numpy.zeros((3,), dtype=theano.config.floatX)
v_x[0] = numpy.dot(v_u1[0], vW_in1) + v_u2[0] * vW_in2 + \
numpy.dot(v_x0, vW)
v_x[0] = (numpy.dot(v_u1[0], vW_in1) + v_u2[0] * vW_in2 +
numpy.dot(v_x0, vW))
v_y[0] = numpy.dot(v_x0, vWout)
for i in xrange(1, 3):
v_x[i] = numpy.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 + \
numpy.dot(v_x[i - 1], vW)
v_x[i] = (numpy.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 +
numpy.dot(v_x[i - 1], vW))
v_y[i] = numpy.dot(v_x[i - 1], vWout)
(theano_x, theano_y) = f4(v_u1, v_u2, v_x0, v_y0, vW_in1)
......@@ -684,9 +683,9 @@ class T_Scan(unittest.TestCase):
y_tm1,
y_tm3,
W_in1):
return [theano.dot(u1_t, W_in1) + \
(u2_t + u2_tm1 * u2_tp1) * W_in2 + \
theano.dot(x_tm1, W),
return [theano.dot(u1_t, W_in1) +
(u2_t + u2_tm1 * u2_tp1) * W_in2 +
theano.dot(x_tm1, W),
(y_tm1 + y_tm3) * theano.dot(x_tm1, W_out),
theano.dot(u1_t, W_in1)]
......@@ -891,10 +890,10 @@ class T_Scan(unittest.TestCase):
numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
numpy_x1[0] = vu0[0] * vW_in + vx1 * vW + vu1[0] + vu2[0]
for i in xrange(1, 3):
numpy_x0[i] = vu0[i] * vW_in + numpy_x0[i - 1] * vW + \
vu1[i] * vu2[i]
numpy_x1[i] = vu0[i] * vW_in + numpy_x1[i - 1] * vW + \
vu1[i] + vu2[i]
numpy_x0[i] = (vu0[i] * vW_in + numpy_x0[i - 1] * vW +
vu1[i] * vu2[i])
numpy_x1[i] = (vu0[i] * vW_in + numpy_x1[i - 1] * vW +
vu1[i] + vu2[i])
# note theano computes inplace, so call function after numpy
# equivalent is done
......@@ -908,8 +907,8 @@ class T_Scan(unittest.TestCase):
# Old way of doing inplace operations is deprecated .. tests don't
# make sense anymore.
##utt.assert_allclose( theano_x0 , vu2)
## utt.assert_allclose( theano_x1 , vu1)
## utt.assert_allclose(theano_x0 , vu2)
## utt.assert_allclose(theano_x1 , vu1)
# simple rnn ; compute inplace version 2
def test_inplace2(self):
......@@ -965,16 +964,16 @@ class T_Scan(unittest.TestCase):
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy
# compute output in numpy
numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,))
numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu1[1]
numpy_x1[0] = vu0[0] * vW_in + vx1 * vW + vu2[0] + vu2[1] + vu2[2]
for i in xrange(1, 3):
numpy_x0[i] = vu0[i] * vW_in + numpy_x0[i - 1] * vW + \
vu1[i] * vu1[i + 1]
numpy_x1[i] = vu0[i] * vW_in + numpy_x1[i - 1] * vW + \
vu2[i] + vu2[i + 1] + vu2[i + 2]
numpy_x0[i] = (vu0[i] * vW_in + numpy_x0[i - 1] * vW +
vu1[i] * vu1[i + 1])
numpy_x1[i] = (vu0[i] * vW_in + numpy_x1[i - 1] * vW +
vu2[i] + vu2[i + 1] + vu2[i + 2])
# note theano computes inplace, so call function after numpy
# equivalent is done
......@@ -1069,8 +1068,8 @@ class T_Scan(unittest.TestCase):
y1 = theano.shared(vy1, 'y1')
def f(u1_t, u2_t, y0_tm3, y0_tm2, y0_tm1, y1_tm1):
y0_t = theano.dot(theano.dot(u1_t, W1), W2) + 0.1 * y0_tm1 + \
0.33 * y0_tm2 + 0.17 * y0_tm3
y0_t = (theano.dot(theano.dot(u1_t, W1), W2) + 0.1 * y0_tm1 +
0.33 * y0_tm2 + 0.17 * y0_tm3)
y1_t = theano.dot(u2_t, W2) + y1_tm1
y2_t = theano.dot(u1_t, W1)
nwW1 = W1 + .1
......@@ -1106,14 +1105,13 @@ class T_Scan(unittest.TestCase):
numpy_W1 = vW1.copy()
numpy_W2 = vW2.copy()
for idx in xrange(3):
numpy_y0[idx + 3] = numpy.dot(\
numpy.dot(vu1[idx, :], numpy_W1), \
numpy_y0[idx + 3] = numpy.dot(numpy.dot(vu1[idx, :], numpy_W1),
numpy_W2) + \
0.1 * numpy_y0[idx + 2] + \
0.33 * numpy_y0[idx + 1] + \
0.17 * numpy_y0[idx]
numpy_y1[idx + 1] = numpy.dot(vu2[idx, :], numpy_W2) +\
numpy_y1[idx]
numpy_y1[idx + 1] = (numpy.dot(vu2[idx, :], numpy_W2) +
numpy_y1[idx])
numpy_y2[idx] = numpy.dot(vu1[idx, :], numpy_W1)
numpy_W1 = numpy_W1 + .1
numpy_W2 = numpy_W2 + .05
......@@ -1168,7 +1166,7 @@ class T_Scan(unittest.TestCase):
def test_simple_shared_random(self):
theano_rng = theano.tensor.shared_randomstreams.RandomStreams(
utt.fetch_seed())
utt.fetch_seed())
values, updates = theano.scan(lambda: theano_rng.uniform((2,), -1, 1),
[],
......@@ -1196,7 +1194,7 @@ class T_Scan(unittest.TestCase):
def test_cuda_gibbs_chain(self):
from theano.sandbox import cuda
if cuda.cuda_available == False:
if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled')
rng = numpy.random.RandomState(utt.fetch_seed())
......@@ -1204,7 +1202,7 @@ class T_Scan(unittest.TestCase):
dtype='float32')
vsample = theano.shared(v_vsample)
trng = theano.sandbox.rng_mrg.MRG_RandomStreams(
utt.fetch_seed())
utt.fetch_seed())
def f(vsample_tm1):
return trng.binomial(vsample_tm1.shape, n=1, p=0.3,
......@@ -1240,7 +1238,7 @@ class T_Scan(unittest.TestCase):
bvis = theano.shared(v_bvis, 'vbvis')
vsample = theano.tensor.matrix(dtype='float32')
trng = theano.tensor.shared_randomstreams.RandomStreams(
utt.fetch_seed())
utt.fetch_seed())
def f(vsample_tm1):
hmean_t = theano.tensor.nnet.sigmoid(
......
......@@ -5033,10 +5033,10 @@ def power(x, y):
return x**y
def swapaxes(y, axis1, axis2):
    """Swap two axes of a tensor variable.

    Returns a view of `y` (via ``dimshuffle``) with `axis1` and `axis2`
    interchanged; all other axes keep their original order.

    :param y: tensor-like; converted with ``as_tensor_variable``.
    :param axis1: index of the first axis to swap.
    :param axis2: index of the second axis to swap.
    :return: a tensor variable with the two axes swapped.
    """
    y = as_tensor_variable(y)
    ndim = y.ndim
    # list(...) is required so item assignment below also works on
    # Python 3, where range() returns a non-mutable range object.
    li = list(range(ndim))
    li[axis1], li[axis2] = li[axis2], li[axis1]
    return y.dimshuffle(li)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论