Merge pull request #1442 from nouiz/mixed2

[WIP] Use the new grad interface.

Merge pull request #1442 from nouiz/mixed2
85db6f61 · lamblin · 51b39ada · 3a1e910c · 85db6f61 · 85db6f61
--- a/doc/install_ubuntu.txt
+++ b/doc/install_ubuntu.txt
@@ -124,6 +124,27 @@ Do like in the section "Updating Theano", but use
 .. _install_ubuntu_gpu:
+Manual OpenBLAS instructions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The OpenBLAS package included in Ubuntu is limited to 2 threads. If you
+want to use more cores at the same time, you will need to compile
+OpenBLAS yourself. The following commands show how to do this.
+.. code-block:: bash
+    # Remove the Ubuntu OpenBLAS package if you installed it
+    sudo apt-get remove libopenblas-base
+    # Download the development version of OpenBLAS
+    git clone git://github.com/xianyi/OpenBLAS
+    cd OpenBLAS
+    make FC=gfortran
+    sudo make PREFIX=/usr/local/ install
+    cd /usr/local/lib
+    ln -s libopenblas.so /usr/lib/libblas.so
+    ln -s libopenblas.so.0 /usr/lib/libblas.so.3gf
 Contributed GPU instruction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/theano/gradient.py
+++ b/theano/gradient.py
@@ -8,6 +8,7 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"
 __docformat__ = "restructuredtext en"
 import __builtin__
+from itertools import izip
 import logging
 import warnings
 _logger = logging.getLogger('theano.gradient')
@@ -17,7 +18,6 @@ np = numpy
 import theano
-from itertools import izip
 from theano import gof
 from theano.gof import Variable
 from theano.gof.python25 import OrderedDict
@@ -78,10 +78,9 @@ def grad_not_implemented(op, x_pos, x, comment=""):
    gradient is not implemented.
    """
-    return (NullType(
+    return (NullType((
-        (
+        "This variable is Null because the grad method for "
-            "This variable is Null because the grad method for "
+        "input %s (%s) of the %s op is not implemented. %s"
-            "input %s (%s) of the %s op is not implemented. %s"
        ) % (x_pos, x, op, comment)))()
@@ -341,8 +340,8 @@ def Lop(f, wrt, eval_points, consider_constant=None,
    known = dict(izip(f, grads))
    ret = grad(cost=None, known_grads=known,
-            consider_constant=consider_constant, wrt=wrt,
+               consider_constant=consider_constant, wrt=wrt,
-            disconnected_inputs=disconnected_inputs)
+               disconnected_inputs=disconnected_inputs)
    return format_as(using_list, using_tuple, ret)
@@ -352,8 +351,8 @@ def Lop(f, wrt, eval_points, consider_constant=None,
 #########################
 def grad(cost, wrt, consider_constant=None,
-        disconnected_inputs='raise', add_names=True,
+         disconnected_inputs='raise', add_names=True,
-        known_grads=None, return_disconnected='zero'):
+         known_grads=None, return_disconnected='zero'):
    """
    :type cost: Scalar (0-dimensional) Variable.
        May optionally be None if known_grads is provided.
@@ -406,17 +405,16 @@ def grad(cost, wrt, consider_constant=None,
    if cost is not None and isinstance(cost.type, NullType):
        raise ValueError("Can't differentiate a NaN cost."
-            "cost is NaN because " + \
+                         "cost is NaN because " +
-                cost.type.why_null)
+                         cost.type.why_null)
    if cost is not None and cost.ndim != 0:
        raise TypeError("cost must be a scalar.")
    if isinstance(wrt, set):
        raise TypeError("wrt must not be a set. sets have no defined "
-                "iteration order, so we can't return gradients in a matching"
+                        "iteration order, so we can't return gradients in a"
-                " order.")
+                        "  matching order.")
    using_list = isinstance(wrt, list)
    using_tuple = isinstance(wrt, tuple)
@@ -426,7 +424,7 @@ def grad(cost, wrt, consider_constant=None,
    for elem in wrt:
        if not isinstance(elem, Variable):
            raise TypeError("Expected Variable, got " + str(elem) +
-                    " of type "+str(type(elem)))
+                            " of type " + str(type(elem)))
    outputs = []
    if cost is not None:
@@ -435,7 +433,7 @@ def grad(cost, wrt, consider_constant=None,
        outputs.extend(known_grads.keys())
    var_to_app_to_idx = _populate_var_to_app_to_idx(
-            outputs, wrt, consider_constant)
+        outputs, wrt, consider_constant)
    # build a dict mapping var to the gradient of cost with respect to var
    grad_dict = OrderedDict()
@@ -452,7 +450,8 @@ def grad(cost, wrt, consider_constant=None,
        # g_cost may be Disconnected or NullType. A creative use of the function,
        # sure, but nonetheless one we can and should support. So before we try
        # to cast it make sure it even has a dtype
-        if hasattr(g_cost.type, 'dtype') and cost.type.dtype not in tensor.discrete_dtypes:
+        if (hasattr(g_cost.type, 'dtype') and
+            cost.type.dtype not in tensor.discrete_dtypes):
            # Here we enforce the constraint that floating point variables have
            # the same dtype as their gradient.
            g_cost = g_cost.astype(cost.type.dtype)
@@ -471,8 +470,8 @@ def grad(cost, wrt, consider_constant=None,
                'Ambiguous whether %s should be made into tensor'
                ' or sparse theano variable' % str(type(g_var)))
-        if not isinstance(g_var.type, (NullType, DisconnectedType)) and 'float' \
+        if (not isinstance(g_var.type, (NullType, DisconnectedType)) and
-            not in str(g_var.type.dtype):
+            'float' not in str(g_var.type.dtype)):
            raise TypeError("Gradients must always be NullType, "
                    "DisconnectedType, or continuous, but grad was "
                    "given a known_grad of type "+str(g_var.type))
@@ -728,11 +727,13 @@ def _populate_var_to_app_to_idx(outputs, wrt, consider_constant):
    return var_to_app_to_idx
 class NullTypeGradError(TypeError):
    """
    Raised when grad encounters a NullType.
    """
 class DisconnectedInputError(ValueError):
    """
    Raised when grad is asked to compute the gradient
@@ -740,8 +741,9 @@ class DisconnectedInputError(ValueError):
    disconnected_inputs='raise'.
    """
 def _populate_grad_dict(var_to_app_to_idx,
-        grad_dict, wrt, cost_name=None):
+                        grad_dict, wrt, cost_name=None):
    """
        Helper function for grad function.
@@ -783,7 +785,7 @@ def _populate_grad_dict(var_to_app_to_idx,
            # list of bools indicating if each output is connected to the cost
            outputs_connected = [not isinstance(g.type, DisconnectedType)
-                    for g in output_grads]
+                                 for g in output_grads]
            connection_pattern = _node_to_pattern(node)
@@ -840,7 +842,7 @@ def _populate_grad_dict(var_to_app_to_idx,
                # each destroyed input.
                try:
                    dinputs = [node.inputs[x[0]] for x in
-                            node.op.destroy_map.values()]
+                               node.op.destroy_map.values()]
                except AttributeError:
                    dinputs = []
@@ -899,11 +901,11 @@ def _populate_grad_dict(var_to_app_to_idx,
                if input_grads is None:
                    raise TypeError("%s.grad returned NoneType, "
-                            "expected iterable." % str(node.op))
+                                    "expected iterable." % str(node.op))
                if len(input_grads) != len(inputs):
-                    raise ValueError(("%s returned the wrong number of" +\
+                    raise ValueError(("%s returned the wrong number of" +
-                            " gradient terms.") % str(node.op))
+                                      " gradient terms.") % str(node.op))
            # must convert to list in case the op returns a tuple
            # we won't be able to post-process out the Nones if it does that
@@ -926,7 +928,7 @@ def _populate_grad_dict(var_to_app_to_idx,
                    # used to mean undefined, zero, or disconnected.
                    # We therefore don't allow it because its usage has become
                    # so muddied.
-                    raise TypeError(('%s.grad returned None for' +\
+                    raise TypeError(('%s.grad returned None for' +
                             ' a gradient term, '
                            'this is prohibited. Instead of None,'
                            'return zeros_like(input), DisconnectedType()(),'
@@ -964,7 +966,7 @@ def _populate_grad_dict(var_to_app_to_idx,
                            msg += "verifiably zeros."
                            msg = msg % (str(node.op), str(term),
-                                    str(type(term)), i)
+                                         str(type(term)), i)
                        if is_zero == 'no':
                            msg = "%s.grad returned %s of type %s for input"
@@ -980,8 +982,8 @@ def _populate_grad_dict(var_to_app_to_idx,
            #Check that op.connection_pattern matches the connectivity
            #logic driving the op.grad method
-            for i, packed in \
+            for i, packed in enumerate(zip(inputs, input_grads,
-                enumerate(zip(inputs, input_grads, inputs_connected)):
+                                           inputs_connected)):
                ipt, ig, connected = packed
                actually_connected = \
                    not isinstance(ig.type, DisconnectedType)
@@ -1027,11 +1029,11 @@ def _populate_grad_dict(var_to_app_to_idx,
                        if not isinstance(term, gof.Variable):
                            raise TypeError("%s.grad returned %s, expected"
                                    " Variable instance." % (str(node.op),
-                                        type(term)))
+                                                             type(term)))
                        if isinstance(term.type, NullType):
                            raise NullTypeGradError("tensor.grad "
-                                "encountered a NaN. " +\
+                                "encountered a NaN. " +
                                    term.type.why_null)
                        #Don't try to sum up DisconnectedType placeholders
@@ -1121,9 +1123,9 @@ class numeric_grad(object):
    # For now, we use a heuristic that catches very bad gradients, but is not
    # perfectly accurate.
    type_eps = {'float64': 1e-7,
-            'float32': 3e-4,
+                'float32': 3e-4,
-            numpy.dtype('float64'): 1e-7,
+                numpy.dtype('float64'): 1e-7,
-            numpy.dtype('float32'): 3e-4}
+                numpy.dtype('float32'): 3e-4}
    def __init__(self, f, pt, eps=None, out_type=None):
        """Return the gradient of f at pt.
@@ -1243,15 +1245,13 @@ class numeric_grad(object):
        """
        if len(g_pt) != len(self.gf):
-            raise ValueError(
+            raise ValueError('argument has wrong number of elements',
-                    'argument has wrong number of elements',
+                             len(g_pt))
-                    len(g_pt))
        errs = []
        for i, (a, b) in enumerate(zip(g_pt, self.gf)):
            if a.shape != b.shape:
-                raise ValueError(
+                raise ValueError('argument element %i has wrong shape %s' % (
-                        'argument element %i has wrong shape %s' % (
+                    i, str((a.shape, b.shape))))
-                            i, str((a.shape, b.shape))))
            errs.append(numeric_grad.abs_rel_err(a, b))
        return errs
@@ -1287,7 +1287,7 @@ class numeric_grad(object):
        # max over the arrays in g_pt
        max_arg = numpy.argmax(errs)
        max_pos = pos[max_arg]
-        return (max_arg, pos[max_arg], abs_errs[max_arg], rel_errs[max_arg])
+        return (max_arg, max_pos, abs_errs[max_arg], rel_errs[max_arg])
 def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
@@ -1336,9 +1336,10 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
        covers that case as well by using random projections.
    """
+    # These imports are done here to prevent circular imports.
    from theano import compile, shared
    import theano.tensor
-    from theano.tensor import as_tensor_variable, cast, TensorType
+    from theano.tensor import as_tensor_variable, TensorType
    assert isinstance(pt, (list, tuple))
    pt = [numpy.array(p) for p in pt]
@@ -1368,11 +1369,12 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
    def function(inputs, output):
        if mode is None:
            f = compile.function(inputs, output, accept_inplace=True,
-                    allow_input_downcast=True, on_unused_input='ignore')
+                                 allow_input_downcast=True,
+                                 on_unused_input='ignore')
        else:
            f = compile.function(inputs, output, accept_inplace=True,
-                    allow_input_downcast=True, mode=mode,
+                                 allow_input_downcast=True, mode=mode,
-                    on_unused_input='ignore')
+                                 on_unused_input='ignore')
        return f
    tensor_pt = [TensorType(
@@ -1421,24 +1423,32 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
    grad_fn = function(tensor_pt, symbolic_grad)
    for test_num in xrange(n_tests):
-        num_grad = numeric_grad(cost_fn, [p.copy() for p in pt], eps, out_type)
+        try:
+            num_grad = numeric_grad(cost_fn, [p.copy() for p in pt],
+                                    eps, out_type)
-        analytic_grad = grad_fn(*[p.copy() for p in pt])
+            analytic_grad = grad_fn(*[p.copy() for p in pt])
-        # Since `tensor_pt` is a list, `analytic_grad` should be one too.
+            # Since `tensor_pt` is a list, `analytic_grad` should be one too.
-        assert isinstance(analytic_grad, list)
+            assert isinstance(analytic_grad, list)
-        max_arg, max_err_pos, max_abs_err, max_rel_err =\
+            max_arg, max_err_pos, max_abs_err, max_rel_err = num_grad.max_err(
-                num_grad.max_err(analytic_grad, abs_tol, rel_tol)
+                analytic_grad, abs_tol, rel_tol)
-        if max_abs_err > abs_tol and max_rel_err > rel_tol:
+            if max_abs_err > abs_tol and max_rel_err > rel_tol:
-            raise verify_grad.E_grad(max_arg, max_err_pos,
+                raise verify_grad.E_grad(max_arg, max_err_pos,
-                    max_abs_err, max_rel_err, abs_tol, rel_tol)
+                                         max_abs_err, max_rel_err,
+                                         abs_tol, rel_tol)
-        # get new random projection for next test
+            # get new random projection for next test
-        if test_num < n_tests - 1:
+            if test_num < n_tests - 1:
-            t_r.set_value(random_projection(), borrow=True)
+                t_r.set_value(random_projection(), borrow=True)
+        except Exception, e:
+            e.args += ("\nThe error happened with the following inputs:", pt,
+                       "\nThe value of eps is:", eps,
+                       "\nThe out_type is:", out_type)
+            raise
 class GradientError(Exception):
@@ -1517,9 +1527,9 @@ def jacobian(expression, wrt, consider_constant=None,
        rvals = []
        for inp in args[2:]:
            rval = grad(expr[idx],
-                     inp,
+                        inp,
-                     consider_constant=consider_constant,
+                        consider_constant=consider_constant,
-                     disconnected_inputs=disconnected_inputs)
+                        disconnected_inputs=disconnected_inputs)
            rvals.append(rval)
        return rvals
    # Computing the gradients does not affect the random seeds on any random
@@ -1527,8 +1537,8 @@ def jacobian(expression, wrt, consider_constant=None,
    # just backtracking over old values. (rp Jan 2012 - if anyone has a
    # counter example please show me)
    jacobs, updates = theano.scan(inner_function,
-                            sequences=arange(expression.shape[0]),
+                                  sequences=arange(expression.shape[0]),
-                            non_sequences=[expression] + wrt)
+                                  non_sequences=[expression] + wrt)
    assert not updates, \
            ("Scan has returned a list of updates. This should not "
             "happen! Report this to theano-users (also include the "
@@ -1537,7 +1547,7 @@ def jacobian(expression, wrt, consider_constant=None,
 def hessian(cost, wrt, consider_constant=None,
-             disconnected_inputs='raise'):
+            disconnected_inputs='raise'):
    """
    :type cost: Scalar (0-dimensional) Variable.
    :type wrt: Vector (1-dimensional tensor) 'Variable' or list of

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -41,7 +41,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
            float * sm_data, int sms0, int sms1,
            float * am_data, int ams0)
        {
-            const int row = blockIdx.x;
+          for (int row = blockIdx.x; row < M; row += gridDim.x){
            const float * x = x_data + xs0 * row;
            const int y_idx = (int)y_idx_data[row * y_idxs0];
@@ -83,6 +83,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
                           + log(sum);
            }
            am_data[row*ams0] = row_max_j;
+          }
        }
        """
@@ -168,7 +169,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
            }
        }
        {
-            int n_blocks = CudaNdarray_HOST_DIMS(%(sm)s)[0];
+            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
+                                    NUM_VECTOR_OP_BLOCKS);
     //TODO: launch more threads per row and do parallel sum and max reductions
            int n_threads = 1;
            int n_shared_bytes = 0; //n_threads * sizeof(float);
@@ -195,8 +197,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
            if (cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
-                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n",
+                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
-                             cudaGetErrorString(err));
+                             "The kernel was launched with %%d threads,"
+                             " %%d blocks and %%d shared memory\\n",
+                             cudaGetErrorString(err),
+                             n_threads, n_blocks, n_shared_bytes);
                // no need to decref output vars the cleanup code will do it
                %(fail)s;
            }
@@ -206,7 +211,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
    def c_code_cache_version(self):
        #return ()
-        return (3,)
+        return (4,)
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
@@ -235,7 +240,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
    def c_code_cache_version(self):
        #return ()
-        return (5,)
+        return (6,)
    def c_code(self, node, nodename, inp, out, sub):
        dnll, sm, y_idx = inp
@@ -283,11 +288,12 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
            }
        }
        {
+            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[0],
+                                    NUM_VECTOR_OP_BLOCKS);
+            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256);
            kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
-                <<<
+                <<<n_blocks, n_threads>>>(
-                    CudaNdarray_HOST_DIMS(%(dx)s)[0],
-                    std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
-                >>>(
                        CudaNdarray_HOST_DIMS(%(dx)s)[0],
                        CudaNdarray_HOST_DIMS(%(dx)s)[1],
@@ -310,9 +316,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
-                             "Cuda error: %%s: %%s.\\n",
+                             "Cuda error: %%s: %%s.\\n"
+                             "The kernel was launched with %%d threads and"
+                             " %%d blocks\\n",
                             "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
-                             cudaGetErrorString(err));
+                             cudaGetErrorString(err), n_threads, n_blocks);
                %(fail)s;
            }
        }

--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -25,7 +25,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
    We check that we loop when their is too much threads
-    TODO: check that we loop when their is too much block(>32*1024)
    """
@@ -100,13 +99,16 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
    This is basic test for GpuCrossentropySoftmax1HotWithBiasDx
    We check that we loop when their is too much threads
-    TODO: check that we loop when their is too much block(>32*1024)
    """
    n_in = 1000
    batch_size = 4097
    n_out = 1250
+    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
+        n_in = 4098
+        n_out = 4099
    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

--- a/theano/sandbox/linalg/ops.py
+++ b/theano/sandbox/linalg/ops.py
@@ -715,10 +715,9 @@ class ExtractDiag(Op):
        implemented our own. """
        x, = ins
        z, = outs
        # zero-dimensional matrices ...
        if x.shape[0] == 0 or x.shape[1] == 0:
-            z[0] = numpy.zeros(0, dtype=x.dtype)
+            z[0] = node.outputs[0].type.value_zeros((0,))
            return
        if x.shape[0] < x.shape[1]:

--- a/theano/sandbox/linalg/tests/test_linalg.py
+++ b/theano/sandbox/linalg/tests/test_linalg.py
@@ -204,8 +204,8 @@ def test_rop_lop():
    rop_f = function([mx, mv], yv)
    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
-                       sequences=tensor.arange(y.shape[0]),
+                        sequences=tensor.arange(y.shape[0]),
-                       non_sequences=[y, mx, mv])
+                        non_sequences=[y, mx, mv])
    scan_f = function([mx, mv], sy)
    rng = numpy.random.RandomState(utt.fetch_seed())
@@ -561,6 +561,7 @@ class test_Eigh(test_Eig):
 class test_Eigh_float32(test_Eigh):
    dtype = 'float32'
 def test_matrix_inverse_solve():
    if not imported_scipy:
        raise SkipTest("Scipy needed for the Solve op.")

--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
@@ -17,7 +17,7 @@ from theano.sparse import enable_sparse
 from theano.gof.python25 import all, any, product
-if enable_sparse == False:
+if not enable_sparse:
    raise SkipTest('Optional package sparse disabled')
 from theano.sparse.basic import _is_dense, _is_sparse, _mtypes
@@ -35,7 +35,7 @@ from theano.sparse import (
    SparseFromDense,
    Cast, cast, HStack, VStack, AddSSData, add_s_s_data,
    structured_minimum, structured_maximum, structured_add,
-     mul_s_v, structured_add_s_v,
+    mul_s_v, structured_add_s_v,
    SamplingDot, sampling_dot,
    Diag, diag, SquareDiagonal, square_diagonal,
    EnsureSortedIndices, ensure_sorted_indices, clean,
@@ -372,17 +372,17 @@ class SparseInferShapeTester(utt.InferShapeTester):
                    [x, y],
                    [grads[0]],
                    [as_sparse_format(random_lil((4, 5),
-                                   config.floatX, 3), format),
+                                                 config.floatX, 3), format),
                     as_sparse_format(random_lil((5, 3),
-                                   config.floatX, 3), format)],
+                                                 config.floatX, 3), format)],
                    op)
            self._compile_and_check(
                    [x, y],
                    [grads[1]],
                    [as_sparse_format(random_lil((4, 5),
-                                   config.floatX, 3), format),
+                                                 config.floatX, 3), format),
                     as_sparse_format(random_lil((5, 3),
-                                   config.floatX, 3), format)],
+                                                 config.floatX, 3), format)],
                    op)
    def test_dense_from_sparse(self):
@@ -398,8 +398,7 @@ class SparseInferShapeTester(utt.InferShapeTester):
        self._compile_and_check([x],
                                [csc_from_dense(x)],
                                [numpy.random.randn(10, 40).astype(
-                    config.floatX)],
+                                    config.floatX)],
                                csc_from_dense.__class__)
    def test_sparse_from_list(self):
@@ -674,27 +673,27 @@ class test_csm_properties(unittest.TestCase):
    def test_csm_properties_grad(self):
        sp_types = {'csc': sp.csc_matrix,
-            'csr': sp.csr_matrix}
+                    'csr': sp.csr_matrix}
        for format in ['csc', 'csr']:
            for dtype in ['float32', 'float64']:
                spmat = sp_types[format](random_lil((4, 3), dtype, 3))
                verify_grad_sparse(lambda *x: CSMProperties()(*x)[0], [spmat],
-                    structured=True)
+                                   structured=True)
                verify_grad_sparse(lambda *x: CSMProperties()(*x)[1], [spmat],
-                    structured=True)
+                                   structured=True)
                verify_grad_sparse(lambda *x: CSMProperties()(*x)[2], [spmat],
-                    structured=True)
+                                   structured=True)
                verify_grad_sparse(lambda *x: CSMProperties()(*x)[2], [spmat],
-                    structured=True)
+                                   structured=True)
    def test_csm_properties(self):
        sp_types = {'csc': sp.csc_matrix,
-            'csr': sp.csr_matrix}
+                    'csr': sp.csr_matrix}
        for format in ['csc', 'csr']:
            for dtype in ['float32', 'float64']:
@@ -717,7 +716,7 @@ class test_csm(unittest.TestCase):
    def test_csm_grad(self):
        sp_types = {'csc': sp.csc_matrix,
-            'csr': sp.csr_matrix}
+                    'csr': sp.csr_matrix}
        for format in ['csc', 'csr']:
            for dtype in ['float32', 'float64']:
@@ -732,7 +731,7 @@ class test_csm(unittest.TestCase):
        Test support for gradients sparser than the input.
        """
        sp_types = {'csc': sp.csc_matrix,
-            'csr': sp.csr_matrix}
+                    'csr': sp.csr_matrix}
        for format in ['csc', 'csr']:
            for dtype in ['float32', 'float64']:
@@ -742,7 +741,7 @@ class test_csm(unittest.TestCase):
                s = tensor.ivector()
                a = as_sparse_variable(sp_types[format](random_lil((4, 3),
-                    dtype, 1)))
+                                                                   dtype, 1)))
                f = theano.function([x, y, z, s],
                                    tensor.grad(dense_from_sparse(
@@ -751,7 +750,7 @@ class test_csm(unittest.TestCase):
                spmat = sp_types[format](random_lil((4, 3), dtype, 3))
                res = f(spmat.data, spmat.indices, spmat.indptr,
-                    numpy.asarray(spmat.shape, 'int32'))
+                        numpy.asarray(spmat.shape, 'int32'))
                assert len(spmat.data) == len(res)
@@ -760,7 +759,7 @@ class test_csm(unittest.TestCase):
        Test support for gradients of unsorted inputs.
        """
        sp_types = {'csc': sp.csc_matrix,
-            'csr': sp.csr_matrix}
+                    'csr': sp.csr_matrix}
        for format in ['csr', 'csc', ]:
            for dtype in ['float32', 'float64']:
@@ -773,7 +772,7 @@ class test_csm(unittest.TestCase):
                                      [1, 2, 1],
                                      [1, 2, 1],
                                      [1, 2, 1]],
-                    dtype=dtype)[range(4)]
+                                     dtype=dtype)[range(4)]
                # Make sure it's unsorted
                assert not a.has_sorted_indices
                a = as_sparse_variable(a)
@@ -782,14 +781,15 @@ class test_csm(unittest.TestCase):
                    dense_from_sparse(a * CSM(format)(x, y, z, s))), x))
                spmat = sp_types[format](random_lil((4, 3), dtype,
-                    12))[range(4)]
+                                                    12))[range(4)]
                assert not spmat.has_sorted_indices
                res = f(spmat.data, spmat.indices, spmat.indptr,
-                    numpy.asarray(spmat.shape, 'int32'))
+                        numpy.asarray(spmat.shape, 'int32'))
                col1 = sp_types[format]((res, spmat.indices, spmat.indptr),
-                    shape=numpy.asarray(spmat.shape, 'int32'))[:, 1].data
+                                        shape=numpy.asarray(spmat.shape,
+                                                            'int32'))[:, 1].data
                assert numpy.all(col1 == 2)
@@ -808,7 +808,7 @@ class test_csm(unittest.TestCase):
                spmat = sp_types[format](random_lil((4, 3), dtype, 3))
                res = f(spmat.data, spmat.indices, spmat.indptr,
-                    numpy.asarray(spmat.shape, 'int32'))
+                        numpy.asarray(spmat.shape, 'int32'))
                assert numpy.all(res.data == spmat.data)
                assert numpy.all(res.indices == spmat.indices)
@@ -909,8 +909,7 @@ class test_structureddot(unittest.TestCase):
        spmat = sp.csc_matrix(spmat)
        images = tensor.Tensor(dtype='float32',
-                               broadcastable=[False, False])(
+                               broadcastable=[False, False])('images')
-            'images')
        cscmat = CSC(kerns, spmat.indices[:spmat.size],
                     spmat.indptr, spmat.shape)
@@ -931,7 +930,8 @@ class test_structureddot(unittest.TestCase):
        #print 'type of kernvals = ', kernvals.dtype
        bsize = 3
        imvals = 1.0 * numpy.array(numpy.arange(bsize * spmat.shape[1]).\
-                reshape(bsize, spmat.shape[1]), dtype='float32')
+                                   reshape(bsize, spmat.shape[1]),
+                                   dtype='float32')
        outvals = f(kernvals, imvals)
        #print outvals
@@ -949,10 +949,10 @@ class test_structureddot(unittest.TestCase):
                f = theano.function([a, b], theano.Out(d, borrow=True))
                topo = f.maker.fgraph.toposort()
                for M, N, K, nnz in [(4, 3, 2, 3),
-                                  (40, 30, 20, 3),
+                                     (40, 30, 20, 3),
-                                  (40, 30, 20, 30),
+                                     (40, 30, 20, 30),
-                                  (400, 3000, 200, 6000),
+                                     (400, 3000, 200, 6000),
-                                  ]:
+                                 ]:
                    a_val = sp_mat[sparse_format_a](
                        random_lil((M, N), sparse_dtype, nnz))
                    b_val = sp_mat[sparse_format_b](
@@ -969,10 +969,10 @@ class test_structureddot(unittest.TestCase):
        f = theano.function([a, b], theano.Out(d, borrow=True))
        for M, N, K, nnz in [(4, 3, 2, 3),
-                (40, 30, 20, 3),
+                             (40, 30, 20, 3),
-                (40, 30, 20, 30),
+                             (40, 30, 20, 30),
-                (400, 3000, 200, 6000),
+                             (400, 3000, 200, 6000),
-                ]:
+                         ]:
            spmat = sp.csc_matrix(random_lil((M, N), sparse_dtype, nnz))
            mat = numpy.asarray(numpy.random.randn(N, K), dense_dtype)
            theano_times = []
@@ -1017,10 +1017,10 @@ class test_structureddot(unittest.TestCase):
        f = theano.function([a, b], d)
        for M, N, K, nnz in [(4, 3, 2, 3),
-                (40, 30, 20, 3),
+                             (40, 30, 20, 3),
-                (40, 30, 20, 30),
+                             (40, 30, 20, 30),
-                (400, 3000, 200, 6000),
+                             (400, 3000, 200, 6000),
-                ]:
+                         ]:
            spmat = sp.csr_matrix(random_lil((M, N), sparse_dtype, nnz))
            mat = numpy.asarray(numpy.random.randn(N, K), dense_dtype)
            t0 = time.time()
@@ -1062,7 +1062,7 @@ class DotTests(utt.InferShapeTester):
        self.v_10 = numpy.asarray(numpy.random.uniform(-1, 1, 10),
                                  dtype=theano.config.floatX)
        self.v_100 = numpy.asarray(numpy.random.uniform(-1, 1, 100),
-                                  dtype=theano.config.floatX)
+                                   dtype=theano.config.floatX)
    def test_csr_dense(self):
        x = theano.sparse.csr_matrix('x')
@@ -1143,7 +1143,7 @@ class DotTests(utt.InferShapeTester):
        a = sparse.csr_matrix('a', dtype='float32')
        b = cuda.float32_shared_constructor(
-                numpy.random.rand(3, 4).astype('float32'))
+            numpy.random.rand(3, 4).astype('float32'))
        d = sparse.dot(a, b)
        f = theano.function([a], d)
@@ -1281,7 +1281,7 @@ class UsmmTests(unittest.TestCase):
                             for node in topo]) == len(topo) - 5)
                new_topo = []
                for node in topo:
-                    if not (isinstance(node.op, tensor.Elemwise) and \
+                    if not (isinstance(node.op, tensor.Elemwise) and
                       isinstance(node.op.scalar_op,
                                  theano.scalar.basic.Cast)):
                        new_topo.append(node)
@@ -1360,8 +1360,8 @@ class test_zeros_like(unittest.TestCase):
        x = theano.sparse.csr_matrix()
        f = theano.function([x], theano.sparse.sp_zeros_like(x))
        vx = scipy.sparse.csr_matrix(numpy.asarray(
-                numpy.random.binomial(1, 0.5, (100, 100)),
+            numpy.random.binomial(1, 0.5, (100, 100)),
-                dtype=theano.config.floatX))
+            dtype=theano.config.floatX))
        fx = f(vx)
@@ -1571,7 +1571,7 @@ class SpSumTester(utt.InferShapeTester):
                                                      shape=(10, 10))
                z = theano.sparse.sp_sum(variable[0], axis=axis)
-                if axis == None:
+                if axis is None:
                    assert z.type.broadcastable == ()
                else:
                    assert z.type.broadcastable == (False, )
@@ -1951,24 +1951,26 @@ class Test_getitem(unittest.TestCase):
            # the [] shortcut for getitem.
            # x[a:b] is not accepted because we don't have sparse vectors
            self.assertRaises(NotImplementedError,
-                    x.__getitem__, (slice(a, b), c))
+                              x.__getitem__, (slice(a, b), c))
            # x[a:b:step, c:d] is not accepted because scipy silently drops
            # the step (!)
            self.assertRaises(ValueError,
-                    x.__getitem__, (slice(a, b, -1), slice(c, d)))
+                              x.__getitem__, (slice(a, b, -1), slice(c, d)))
            self.assertRaises(ValueError,
-                    x.__getitem__, (slice(a, b), slice(c, d, 2)))
+                              x.__getitem__, (slice(a, b), slice(c, d, 2)))
            # Advanced indexing is not supported
            self.assertRaises(ValueError,
-                    x.__getitem__, (tensor.ivector('l'), slice(a, b)))
+                              x.__getitem__,
+                              (tensor.ivector('l'), slice(a, b)))
            # Indexing with random things is not supported either
            self.assertRaises(ValueError,
-                    x.__getitem__, slice(tensor.fscalar('f'), None))
+                              x.__getitem__, slice(tensor.fscalar('f'), None))
            self.assertRaises(ValueError,
-                    x.__getitem__, (slice(None), slice([1, 3, 4], None)))
+                              x.__getitem__,
+                              (slice(None), slice([1, 3, 4], None)))
    def test_GetItemScalar(self):
        sparse_formats = ('csc', 'csr')
@@ -1981,7 +1983,7 @@ class Test_getitem(unittest.TestCase):
            n = 42
            vx = as_sparse_format(self.rng.binomial(1, 0.5, (97, 100)),
-                                 format).astype(theano.config.floatX)
+                                  format).astype(theano.config.floatX)
            f1 = theano.function([x, a, b], x[a, b])
            r1 = f1(vx, 10, 10)
@@ -2248,6 +2250,7 @@ def elemwise_checker(op, expected_f, gap=None, test_dtypes=None,
            else:
                self.gap_grad = gap
            # Ensure the test's name is correct.
+            utt.seed_rng()
            assert eval(self.__class__.__name__) is self.__class__
        def test_op(self):
@@ -2449,7 +2452,8 @@ TanTester = elemwise_checker(
 ArcsinTester = elemwise_checker(
    sparse.arcsin,
    numpy.arcsin,
-    gap=(-1, 1))
+    gap=(-1, 1),
+    gap_grad=(-0.99, 0.99))
 ArctanTester = elemwise_checker(
    sparse.arctan,
@@ -2501,7 +2505,7 @@ FloorTester = elemwise_checker(
    numpy.floor,
    grad_test=False,
    test_dtypes=[m for m in sparse.all_dtypes
-                 if not  m in sparse.complex_dtypes])
+                 if not m in sparse.complex_dtypes])
 Log1pTester = elemwise_checker(
    sparse.log1p,
@@ -2516,20 +2520,20 @@ Deg2radTester = elemwise_checker(
    sparse.deg2rad,
    numpy.deg2rad,
    test_dtypes=[m for m in sparse.all_dtypes
-                 if not  m in sparse.complex_dtypes])
+                 if not m in sparse.complex_dtypes])
 Rad2degTester = elemwise_checker(
    sparse.rad2deg,
    numpy.rad2deg,
    test_dtypes=[m for m in sparse.all_dtypes
-                 if not  m in sparse.complex_dtypes])
+                 if not m in sparse.complex_dtypes])
 TruncTester = elemwise_checker(
    sparse.trunc,
    numpy.trunc,
    test_dtypes=[m for m in sparse.all_dtypes
-                 if not  m in sparse.complex_dtypes])
+                 if not m in sparse.complex_dtypes])
 SqrTester = elemwise_checker(
@@ -2548,7 +2552,7 @@ class MulSVTester(unittest.TestCase):
    def test_mul_s_v_grad(self):
        sp_types = {'csc': sp.csc_matrix,
-            'csr': sp.csr_matrix}
+                    'csr': sp.csr_matrix}
        for format in ['csr', 'csc']:
            for dtype in ['float32', 'float64']:
@@ -2556,7 +2560,8 @@ class MulSVTester(unittest.TestCase):
                mat = numpy.asarray(numpy.random.rand(3), dtype=dtype)
                theano.sparse.verify_grad_sparse(mul_s_v,
-                    [spmat, mat], structured=True)
+                                                 [spmat, mat],
+                                                 structured=True)
    def test_mul_s_v(self):
        sp_types = {'csc': sp.csc_matrix,
@@ -2590,7 +2595,8 @@ class StructuredAddSVTester(unittest.TestCase):
                mat = numpy.asarray(numpy.random.rand(3), dtype=dtype)
                theano.sparse.verify_grad_sparse(structured_add_s_v,
-                    [spmat, mat], structured=True)
+                                                 [spmat, mat],
+                                                 structured=True)
    def test_structured_add_s_v(self):
        sp_types = {'csc': sp.csc_matrix,
@@ -2618,11 +2624,11 @@ class SamplingDotTester(utt.InferShapeTester):
    x.append(sparse.csr_matrix())
    #unsquare shape
    a = [numpy.array(numpy.random.random_integers(5, size=(4, 3)) - 1,
-                      dtype=theano.config.floatX),
+                     dtype=theano.config.floatX),
         numpy.array(numpy.random.random_integers(5, size=(5, 3)) - 1,
-                      dtype=theano.config.floatX),
+                     dtype=theano.config.floatX),
         numpy.array(numpy.random.random_integers(2, size=(4, 5)) - 1,
-                      dtype=theano.config.floatX)
+                     dtype=theano.config.floatX)
         ]
    a[2] = sp.csr_matrix(a[2])
@@ -2672,7 +2678,7 @@ test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester(
    ref_fct_=lambda a: numpy.asarray((a * 2).todense()),
    cast_value_=scipy.sparse.csr_matrix,
    name='test_shared_options',
-    )
+)
 if __name__ == '__main__':

--- a/theano/tensor/sort.py
+++ b/theano/tensor/sort.py
@@ -144,7 +144,20 @@ class ArgSortOp(theano.Op):
    def grad(self, inputs, output_grads):
        #No grad defined for intergers.
-        return [None, None]
+        inp, axis = inputs
+        # Report the missing gradient against `inp`, the variable that
+        # actually sits at input position 0, so the generated message
+        # pairs the position with the matching variable.
+        inp_grad = theano.gradient.grad_not_implemented(
+            self, 0, inp,
+            "I'm not sure if argsort should have its gradient"
+            " implemented or is should be marked as undefined."
+            " So I mark it as not implemented for now.")
+        axis_grad = theano.gradient.grad_undefined(
+            self, 1, axis,
+            "argsort is not defined for non-integer axes so"
+            " argsort(x, axis+eps) is undefined")
+        return [inp_grad, axis_grad]
    """
    def R_op(self, inputs, eval_points):
        # R_op can receive None as eval_points.

--- a/theano/tests/run_tests_in_batch.py
+++ b/theano/tests/run_tests_in_batch.py
@@ -185,7 +185,9 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
                subprocess_extra_args.update(dict(
                    stdout=dummy_out.fileno(),
                    stderr=dummy_out.fileno()))
+            t0 = time.time()
            subprocess.call(cmd, **subprocess_extra_args)
+            t1 = time.time()
            # Recover failed test indices from the 'failed' field of the
            # '.noseids' file. We need to do it after each batch because
            # otherwise this field may get erased. We use a set because it
@@ -193,8 +195,8 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
            # to avoid duplicates.
            failed = failed.union(cPickle.load(open(noseids_file, 'rb'))
                                  ['failed'])
-            print '%s%% done (failed: %s)' % ((test_range[-1] * 100) //
+            print '%s%% done in %.3fs (failed: %s)' % (
-                                n_tests, len(failed))
+                (test_range[-1] * 100) // n_tests, t1 - t0, len(failed))
        # Sort for cosmetic purpose only.
        failed = sorted(failed)
        if failed: