Merge pull request #1946 from nouiz/scan

Small scan speed up on the GPU.

Merge pull request #1946 from nouiz/scan
38e2f502 · abergeron · fbd201b0 · 8fb5ac0a · 38e2f502 · 38e2f502
--- a/doc/index.txt
+++ b/doc/index.txt
@@ -21,6 +21,8 @@ Montreal).
 News
 ====
+* Colin Raffel wrote a `tutorial on Theano <http://nbviewer.ipython.org/github/craffel/theano-tutorial/blob/master/Theano%20Tutorial.ipynb>`_.
 * Ian Goodfellow did a `12h class with exercises on Theano <https://github.com/goodfeli/theano_exercises>`_.
 * Theano 0.6 was released. Everybody is encouraged to update.

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -120,8 +120,18 @@ enum = EnumStr("g++", "")
 try:
    rc = call_subprocess_Popen(['g++', '-v'])
 except OSError:
+    enum = EnumStr("")
    rc = 1
-if rc == 0:
+AddConfigVar('cxx',
+             "The C++ compiler to use. Currently only g++ is"
+             " supported, but supporting additional compilers should not be "
+             "too difficult. "
+             "If it is empty, no C++ code is compiled.",
+             enum,
+             in_c_key=False)
+del enum
+if rc == 0 and config.cxx != "":
    # Keep the default linker the same as the one for the mode FAST_RUN
    AddConfigVar('linker',
                 ("Default linker used if the theano flags mode is Mode "
@@ -140,16 +150,6 @@ else:
            'optimized C-implementations (for both CPU and GPU) and will '
            'default to Python implementations. Performance will be severely '
            'degraded.')
-    enum = EnumStr("")
-AddConfigVar('cxx',
-             "The C++ compiler to use. Currently only g++ is"
-             " supported, but supporting additional compilers should not be "
-             "too difficult. "
-             "If it is empty, no C++ code is compiled.",
-             enum,
-             in_c_key=False)
-del enum
 #Keep the default value the same as the one for the mode FAST_RUN

--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
@@ -12,7 +12,8 @@ import warnings
 from theano.gof.python25 import all
-from theano.configparser import config, AddConfigVar, BoolParam, ConfigParam
+from theano.configparser import (config, AddConfigVar,
+                                 BoolParam, ConfigParam, _config_var_list)
 import theano.gof.cmodule
@@ -560,7 +561,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e:
    # already changed the default linker to something else then CVM.
    # Currently this is the py linker.
    # Here we assert that the default linker is not cvm.
-    assert not [x for x in theano.configparser._config_var_list
+    assert not [x for x in _config_var_list
                if x.fullname == 'linker'][0].default.startswith('cvm'), e
    pass

--- a/theano/sandbox/linalg/ops.py
+++ b/theano/sandbox/linalg/ops.py
@@ -1411,9 +1411,10 @@ def norm(x,ord):
    elif ndim > 2:
        raise NotImplementedError("We don't support norm witn ndim > 2")
 class lstsq(theano.Op):
    def __eq__(self, other):
-        pass
+        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -422,9 +422,17 @@ class Scan(PureOp):
                raise ValueError('For output %s you need to provide a '
                                 'scalar int !', str(outer_nitsot))
        assert len(new_inputs) == len(inputs)
-        self.vector_seqs = [seq.ndim == 1 for seq in
+        # The vector_seqs and vector_outs are just a workaround for
+        # strange NumPy behavior: vector_ndarray[int] returns a NumPy
+        # scalar and not a NumPy ndarray of 0 dimensions.
+        self.vector_seqs = [isinstance(seq, (tensor.TensorVariable,
+                                             tensor.TensorConstant)) and
+                            seq.ndim == 1 for seq in
                            new_inputs[1:1 + self.n_seqs]]
-        self.vector_outs = [arg.ndim == 1 for arg in
+        self.vector_outs = [isinstance(arg, (tensor.TensorVariable,
+                                             tensor.TensorConstant)) and
+                            arg.ndim == 1 for arg in
                            new_inputs[1 + self.n_seqs: (1 + self.n_seqs +
                                                         self.n_outs)]]
        self.vector_outs += [False] * self.n_nit_sot
@@ -598,12 +606,6 @@ class Scan(PureOp):
                for _d1 in range(cython_mit_mot_out_nslices[_d0]):
                    cython_mit_mot_out_slices[_d0, _d1] = \
                        self.mit_mot_out_slices[_d0][_d1]
-            vector_seqs = [seq.ndim == 1 for seq in
-                                 node.inputs[1:1 + self.n_seqs]]
-            vector_outs = [arg.ndim == 1 for arg in
-                                 node.inputs[1 + self.n_seqs:
-                                             (1 + self.n_seqs + self.n_outs)]]
-            vector_outs += [False] * self.n_nit_sot
            cython_vector_seqs = numpy.asarray(self.vector_seqs,
                                                    dtype='int32')

--- a/theano/scan_module/scan_perform_ext.py
+++ b/theano/scan_module/scan_perform_ext.py
-import os, logging, sys
+import logging
+import os
+import sys
+import numpy
 import theano
 from theano import config
@@ -60,6 +64,28 @@ except ImportError:
                os.mkdir(loc)
            preargs = ['-fwrapv', '-O2', '-fno-strict-aliasing']
            preargs += cmodule.GCC_compiler.compile_args()
+            # Cython 19.1 always uses the old NumPy interface, so we
+            # need to manually modify the .c file to get it compiled
+            # by Theano, because by default we tell NumPy not to
+            # import the old interface.
+            if False:
+                # During scan Cython development, it is helpful to keep the old interface, to avoid manually editing the .c file each time.
+                preargs.remove('-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION')
+            else:
+                numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
+                # Add some macros to lower the number of edits
+                # needed to the .c file.
+                if bool(numpy_ver >= [1, 7]):
+                    # Needed when we disable the old API, as Cython
+                    # uses the old interface.
+                    preargs.append("-D NPY_ENSUREARRAY=NPY_ARRAY_ENSUREARRAY")
+                    preargs.append("-D NPY_ENSURECOPY=NPY_ARRAY_ENSURECOPY")
+                    preargs.append("-D NPY_ALIGNED=NPY_ARRAY_ALIGNED")
+                    preargs.append("-D NPY_WRITEABLE=NPY_ARRAY_WRITEABLE")
+                    preargs.append("-D NPY_UPDATE_ALL=NPY_ARRAY_UPDATE_ALL")
+                    preargs.append("-D NPY_C_CONTIGUOUS=NPY_ARRAY_C_CONTIGUOUS")
+                    preargs.append("-D NPY_F_CONTIGUOUS=NPY_ARRAY_F_CONTIGUOUS")
            cmodule.GCC_compiler.compile_str(dirname, code, location=loc,
                                             preargs=preargs)
            # Save version into the __init__.py file.

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -86,7 +86,7 @@ class multiple_outputs_numeric_grad:
                    dtype_eps = _eps
        self.ndarray_mask = ndarray_mask
-        #'''
+        # '''
        # Compute clean output:
        f_x = f(*pt)
        gx = []
@@ -148,7 +148,7 @@ class multiple_outputs_numeric_grad:
            return numpy.inf, 0
-#TODO: Test this function, and if it works,
+# TODO: Test this function, and if it works,
 # use it with the normal verify_grad rather than the
 # copy-and-pasted one above.
 # Also - add a reference to this technique in the
@@ -201,7 +201,6 @@ def grab_scan_node(output):
 class T_Scan(unittest.TestCase):
-#class T_Scan(object):
    def setUp(self):
        utt.seed_rng()
@@ -230,7 +229,7 @@ class T_Scan(unittest.TestCase):
                                updates=updates,
                                allow_input_downcast=True)
-        ### TESTING PICKLE-ing this function
+        # TESTING PICKLE-ing this function
        origdir = os.getcwd()
        tmpdir = None
        try:
@@ -367,7 +366,7 @@ class T_Scan(unittest.TestCase):
    # This first version test the first case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu1(self):
        from theano.sandbox import cuda
-        if cuda.cuda_available == False:
+        if not cuda.cuda_available:
            raise SkipTest('Optional package cuda disabled')
        def f_rnn(u_t, x_tm1, W_in, W):
@@ -447,7 +446,7 @@ class T_Scan(unittest.TestCase):
    # This second version test the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        from theano.sandbox import cuda
-        if cuda.cuda_available == False:
+        if not cuda.cuda_available:
            raise SkipTest('Optional package cuda disabled')
        def f_rnn(u_t, x_tm1, W_in, W):
@@ -511,7 +510,7 @@ class T_Scan(unittest.TestCase):
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        from theano.sandbox import cuda
-        if cuda.cuda_available == False:
+        if not cuda.cuda_available:
            raise SkipTest('Optional package cuda disabled')
        def f_rnn(u_t, x_tm1, W_in, W):
@@ -595,8 +594,8 @@ class T_Scan(unittest.TestCase):
        v_out = numpy.zeros((4,))
        v_out[0] = v_u[0] * W_in.get_value() + v_x0 * W.get_value()
        for step in xrange(1, 4):
-            v_out[step] = v_u[step] * W_in.get_value() + \
+            v_out[step] = (v_u[step] * W_in.get_value() +
-                    v_out[step - 1] * W.get_value()
+                           v_out[step - 1] * W.get_value())
        theano_values = f3(v_u, v_x0)
        assert numpy.allclose(theano_values, v_out)
@@ -624,7 +623,7 @@ class T_Scan(unittest.TestCase):
        y0 = theano.tensor.scalar('y0')
        def f_rnn_cmpl(u1_t, u2_t, x_tm1, y_tm1, W_in1):
-            return [theano.dot(u1_t, W_in1) + u2_t * W_in2 + \
+            return [theano.dot(u1_t, W_in1) + u2_t * W_in2 +
                    theano.dot(x_tm1, W), theano.dot(x_tm1, W_out)]
        outputs, updates = theano.scan(f_rnn_cmpl,
@@ -643,12 +642,12 @@ class T_Scan(unittest.TestCase):
        # compute the values in numpy
        v_x = numpy.zeros((3, 2), dtype=theano.config.floatX)
        v_y = numpy.zeros((3,), dtype=theano.config.floatX)
-        v_x[0] = numpy.dot(v_u1[0], vW_in1) + v_u2[0] * vW_in2 + \
+        v_x[0] = (numpy.dot(v_u1[0], vW_in1) + v_u2[0] * vW_in2 +
-                    numpy.dot(v_x0, vW)
+                  numpy.dot(v_x0, vW))
        v_y[0] = numpy.dot(v_x0, vWout)
        for i in xrange(1, 3):
-            v_x[i] = numpy.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 + \
+            v_x[i] = (numpy.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 +
-                        numpy.dot(v_x[i - 1], vW)
+                      numpy.dot(v_x[i - 1], vW))
            v_y[i] = numpy.dot(v_x[i - 1], vWout)
        (theano_x, theano_y) = f4(v_u1, v_u2, v_x0, v_y0, vW_in1)
@@ -684,8 +683,8 @@ class T_Scan(unittest.TestCase):
                       y_tm1,
                       y_tm3,
                       W_in1):
-            return [theano.dot(u1_t, W_in1) + \
+            return [theano.dot(u1_t, W_in1) +
-                        (u2_t + u2_tm1 * u2_tp1) * W_in2 + \
+                    (u2_t + u2_tm1 * u2_tp1) * W_in2 +
                    theano.dot(x_tm1, W),
                    (y_tm1 + y_tm3) * theano.dot(x_tm1, W_out),
                    theano.dot(u1_t, W_in1)]
@@ -891,10 +890,10 @@ class T_Scan(unittest.TestCase):
        numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
        numpy_x1[0] = vu0[0] * vW_in + vx1 * vW + vu1[0] + vu2[0]
        for i in xrange(1, 3):
-            numpy_x0[i] = vu0[i] * vW_in + numpy_x0[i - 1] * vW + \
+            numpy_x0[i] = (vu0[i] * vW_in + numpy_x0[i - 1] * vW +
-                    vu1[i] * vu2[i]
+                           vu1[i] * vu2[i])
-            numpy_x1[i] = vu0[i] * vW_in + numpy_x1[i - 1] * vW + \
+            numpy_x1[i] = (vu0[i] * vW_in + numpy_x1[i - 1] * vW +
-                    vu1[i] + vu2[i]
+                           vu1[i] + vu2[i])
        # note theano computes inplace, so call function after numpy
        # equivalent is done
@@ -908,8 +907,8 @@ class T_Scan(unittest.TestCase):
        # Old way of doing inplace operations is deprecated .. tests don't
        # make sense anymore.
-        ##utt.assert_allclose( theano_x0 , vu2)
+        ## utt.assert_allclose(theano_x0 , vu2)
-        ## utt.assert_allclose( theano_x1 , vu1)
+        ## utt.assert_allclose(theano_x1 , vu1)
    # simple rnn ; compute inplace version 2
    def test_inplace2(self):
@@ -971,10 +970,10 @@ class T_Scan(unittest.TestCase):
        numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu1[1]
        numpy_x1[0] = vu0[0] * vW_in + vx1 * vW + vu2[0] + vu2[1] + vu2[2]
        for i in xrange(1, 3):
-            numpy_x0[i] = vu0[i] * vW_in + numpy_x0[i - 1] * vW + \
+            numpy_x0[i] = (vu0[i] * vW_in + numpy_x0[i - 1] * vW +
-                    vu1[i] * vu1[i + 1]
+                           vu1[i] * vu1[i + 1])
-            numpy_x1[i] = vu0[i] * vW_in + numpy_x1[i - 1] * vW + \
+            numpy_x1[i] = (vu0[i] * vW_in + numpy_x1[i - 1] * vW +
-                    vu2[i] + vu2[i + 1] + vu2[i + 2]
+                           vu2[i] + vu2[i + 1] + vu2[i + 2])
        # note theano computes inplace, so call function after numpy
        # equivalent is done
@@ -1069,8 +1068,8 @@ class T_Scan(unittest.TestCase):
        y1 = theano.shared(vy1, 'y1')
        def f(u1_t, u2_t, y0_tm3, y0_tm2, y0_tm1, y1_tm1):
-            y0_t = theano.dot(theano.dot(u1_t, W1), W2) + 0.1 * y0_tm1 + \
+            y0_t = (theano.dot(theano.dot(u1_t, W1), W2) + 0.1 * y0_tm1 +
-                    0.33 * y0_tm2 + 0.17 * y0_tm3
+                    0.33 * y0_tm2 + 0.17 * y0_tm3)
            y1_t = theano.dot(u2_t, W2) + y1_tm1
            y2_t = theano.dot(u1_t, W1)
            nwW1 = W1 + .1
@@ -1106,14 +1105,13 @@ class T_Scan(unittest.TestCase):
        numpy_W1 = vW1.copy()
        numpy_W2 = vW2.copy()
        for idx in xrange(3):
-            numpy_y0[idx + 3] = numpy.dot(\
+            numpy_y0[idx + 3] = numpy.dot(numpy.dot(vu1[idx, :], numpy_W1),
-                                          numpy.dot(vu1[idx, :], numpy_W1), \
                                          numpy_W2) + \
                                0.1 * numpy_y0[idx + 2] + \
                                0.33 * numpy_y0[idx + 1] + \
                                0.17 * numpy_y0[idx]
-            numpy_y1[idx + 1] = numpy.dot(vu2[idx, :], numpy_W2) +\
+            numpy_y1[idx + 1] = (numpy.dot(vu2[idx, :], numpy_W2) +
-                                numpy_y1[idx]
+                                 numpy_y1[idx])
            numpy_y2[idx] = numpy.dot(vu1[idx, :], numpy_W1)
            numpy_W1 = numpy_W1 + .1
            numpy_W2 = numpy_W2 + .05
@@ -1196,7 +1194,7 @@ class T_Scan(unittest.TestCase):
    def test_cuda_gibbs_chain(self):
        from theano.sandbox import cuda
-        if cuda.cuda_available == False:
+        if not cuda.cuda_available:
            raise SkipTest('Optional package cuda disabled')
        rng = numpy.random.RandomState(utt.fetch_seed())

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -5033,7 +5033,7 @@ def power(x, y):
    return x**y
-def swapaxes(y,axis1,axis2):
+def swapaxes(y, axis1, axis2):
    "swap axes of inputted tensor"
    y = as_tensor_variable(y)
    ndim = y.ndim