Commit 01232098 authored by abergeron

Merge pull request #2229 from nouiz/mixed

Mixed
@@ -3,8 +3,8 @@
 Easy Installation of an optimized Theano on Ubuntu
 ==================================================
-These instructions were written for Ubuntu 11.04, 11.10, 12.04, 12.10, 13.04
-and 13.10. You can probably do something similar on older versions.
+These instructions were written for Ubuntu 11.04, 11.10, 12.04, 12.10, 13.04,
+13.10 and 14.04.
 .. note::
@@ -148,8 +148,6 @@
 Do like in the section "Updating Theano", but use
 ``git+git://github.com/Theano/Theano.git`` instead of ``theano``.
-.. _install_ubuntu_gpu:
 Manual Openblas instruction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -172,6 +170,8 @@
 yourself. Here is some code that will help you.
 echo -e "\n[blas]\nldflags = -lopenblas\n" >> ~/.theanorc
+.. _install_ubuntu_gpu:
 Contributed GPU instruction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -188,7 +188,17 @@
 Ubuntu 11.10/12.04 (probably works on 11.04 too):
 Then you need to fetch the latest CUDA toolkit (download the Ubuntu 11.04 32/64-bit package)
 from `here <http://developer.nvidia.com/cuda-downloads>`_.
-For the `run` installer (the only one available for CUDA 5.0 and older), you install it like this:
+Ubuntu 14.04:
+
+.. code-block:: bash
+
+    sudo apt-get install nvidia-current
+    sudo apt-get install nvidia-cuda-toolkit  # As of October 31, 2014, this provides CUDA 5.5, not the latest CUDA 6.5
+
+If you want CUDA 6.5, you can download packages from `nvidia
+<http://developer.nvidia.com/cuda-downloads>`_ for Ubuntu 14.04.
+
+If you downloaded the `run` package (the only one available for CUDA 5.0 and older), you install it like this:
 .. code-block:: bash
@@ -197,14 +207,20 @@
 Since CUDA 5.5, Nvidia provides a DEB package. If you don't know how to
 install it, just double-click on it from the graphical interface. It
-should ask if you want to install it.
+should ask if you want to install it. On Ubuntu 14.04, you need to run
+this in your terminal:
+
+.. code-block:: bash
+
+    sudo apt-get update
+    sudo apt-get install cuda
 You must reboot the computer after the driver installation. To test
 that it was loaded correctly after the reboot, run the command
 `nvidia-smi` from the command line.
 You probably need to change the default version of gcc as
-`explained by Benjamin J. McCann <http://www.benmccann.com/blog/installing-cuda-and-theano/>`_:
+`explained by Benjamin J. McCann <http://www.benmccann.com/blog/installing-cuda-and-theano/>`_ if the package you downloaded is for another Ubuntu version:
@@ -226,14 +242,16 @@
 Test GPU configuration
 .. note::
-    Ubuntu 10.04 LTS: default gcc version 4.4.3. gcc 4.1.2, 4.3.4 availables.
-    Ubuntu 11.04: default gcc version 4.5.2. gcc 4.4.5 availables.
-    Ubuntu 11.10: default gcc version 4.6.1. gcc 4.4.6 and 4.5.3 availables.
-    Ubuntu 12.04 LTS: default gcc version 4.6.3. gcc 4.4.7 and 4.5.3 availables.
-    Ubuntu 12.10: default gcc version 4.7.2. gcc 4.4.7, 4.5.4 and 4.6.3 availables.
-    Ubuntu 13.10: default gcc version 4.8.1. gcc 4.4.7, 4.6.4 and 4.7.3 availables.
+    Ubuntu 10.04 LTS: default gcc version 4.4.3. gcc 4.1.2, 4.3.4 available.
+    Ubuntu 11.04: default gcc version 4.5.2. gcc 4.4.5 available.
+    Ubuntu 11.10: default gcc version 4.6.1. gcc 4.4.6 and 4.5.3 available.
+    Ubuntu 12.04 LTS: default gcc version 4.6.3. gcc 4.4.7 and 4.5.3 available.
+    Ubuntu 12.10: default gcc version 4.7.2. gcc 4.4.7, 4.5.4 and 4.6.3 available.
+    Ubuntu 13.10: default gcc version 4.8.1. gcc 4.4.7, 4.6.4 and 4.7.3 available.
+    Ubuntu 14.04: default gcc version 4.8.2. gcc 4.4.7, 4.6.4 and 4.7.3 available.
@@ -48,6 +48,10 @@
 different memory layout, has shape restrictions, but does not use
 extra memory and is faster than the legacy convolution.
+If you want to verify the usage of cuDNN, you can use the Theano
+flag ``optimizer_including=cudnn``. This will raise an error if
+cuDNN can't be used.
 TODO: Give examples on how to use these things! They are pretty complicated.
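As a sketch of how that flag might be set (assuming a standard Theano installation; the script name is a placeholder, not part of the commit):

```shell
# Enable the cuDNN optimizer for a single run; Theano raises a
# RuntimeError at optimization time if cuDNN cannot be used.
THEANO_FLAGS=optimizer_including=cudnn python my_script.py  # my_script.py is hypothetical

# Or make it permanent by appending to ~/.theanorc:
echo -e "\n[global]\noptimizer_including = cudnn\n" >> ~/.theanorc
```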
...
@@ -1492,7 +1492,7 @@ class numeric_grad(object):
 def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
                 out_type=None, abs_tol=None,
-                rel_tol=None, mode=None, cast_to_output_type=False):
+                rel_tol=None, mode=None, cast_to_output_type=True):
     """Test a gradient by Finite Difference Method. Raise error on failure.
     Example:
@@ -1517,7 +1517,8 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
     None is type-dependent)
     Raising the value of eps can raise or lower the absolute and
     relative errors of the verification depending on the
-    Op. Raising eps does not lower the verification quality. It
+    Op. Raising eps does not lower the verification quality
+    for linear operations. It
     is better to raise eps than raising abs_tol or rel_tol.
     :param out_type: dtype of output, if complex (i.e. 'complex32' or
     'complex64')
@@ -1525,6 +1526,9 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
     comparison
     :param rel_tol: relative tolerance used as threshold for gradient
     comparison
+    :param cast_to_output_type: if the output is float32 and
+        cast_to_output_type is True, cast the random projection to
+        float32. Otherwise it is float64.
     :note: WARNING to unit-test writers: if `op` is a function that builds
     a graph, try to make it a SMALL graph. Often verify grad is run
@@ -1604,7 +1608,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
     # otherwise too much precision is lost in numerical gradient
     def random_projection():
         plain = rng.rand(*o_fn_out.shape) + 0.5
-        if cast_to_output_type:
+        if cast_to_output_type and o_output.dtype == "float32":
             return numpy.array(plain, o_output.dtype)
         return plain
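The finite-difference check that `verify_grad` performs can be sketched without Theano. This is a simplified NumPy version of the idea (the function names mirror the diff, but the tolerances and failure logic here are illustrative, not Theano's exact defaults):

```python
import numpy

def numeric_grad(f, x, eps=1e-6):
    """Central-difference approximation of df/dx for a scalar-output f."""
    g = numpy.zeros_like(x)
    for i in range(x.size):
        d = numpy.zeros_like(x)
        d.flat[i] = eps
        g.flat[i] = (f(x + d) - f(x - d)) / (2 * eps)
    return g

def verify_grad(f, grad_f, x, abs_tol=1e-4, rel_tol=1e-4):
    """Raise if the symbolic gradient disagrees with the numeric one."""
    num = numeric_grad(f, x)
    sym = grad_f(x)
    abs_err = numpy.abs(num - sym)
    rel_err = abs_err / numpy.maximum(numpy.abs(num) + numpy.abs(sym), 1e-8)
    # Fail only where both the absolute and relative error are too large.
    if numpy.any((abs_err > abs_tol) & (rel_err > rel_tol)):
        raise ValueError("gradient verification failed")

# f(x) = sum(x**2) has gradient 2*x; this check passes silently.
verify_grad(lambda x: (x ** 2).sum(), lambda x: 2 * x, numpy.arange(4.0))
```

This also illustrates why raising `eps` changes the measured error for nonlinear ops but not for linear ones, as the amended docstring notes.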
...
@@ -99,14 +99,17 @@ def debugprint(obj, depth=-1, print_type=False,
         order = obj.toposort()
     elif isinstance(obj, (int, long, float, numpy.ndarray)):
         print obj
+    elif isinstance(obj, (theano.In, theano.Out)):
+        results_to_print.append(obj.variable)
     else:
         raise TypeError("debugprint cannot print an object of this type",
                         obj)
     scan_ops = []
     for r in results_to_print:
-        #Add the parent scan op to the list as well
-        if hasattr(r.owner, 'op') and isinstance(r.owner.op, theano.scan_module.scan_op.Scan):
+        # Add the parent scan op to the list as well
+        if (hasattr(r.owner, 'op') and
+                isinstance(r.owner.op, theano.scan_module.scan_op.Scan)):
             scan_ops.append(r)
         debugmode.debugprint(r, depth=depth, done=done, print_type=print_type,
@@ -120,19 +123,26 @@ def debugprint(obj, depth=-1, print_type=False,
     for s in scan_ops:
         print >> file, ""
-        debugmode.debugprint(s, depth=depth, done=done, print_type=print_type,
+        debugmode.debugprint(s, depth=depth, done=done,
+                             print_type=print_type,
                              file=_file, ids=ids,
                              scan_ops=scan_ops, stop_on_name=stop_on_name)
-        for idx, i in enumerate(s.owner.op.outputs):
+        if hasattr(s.owner.op, 'fn'):
+            # If the op was compiled, print the optimized version.
+            outputs = s.owner.op.fn.maker.fgraph.outputs
+        else:
+            outputs = s.owner.op.outputs
+        for idx, i in enumerate(outputs):
             if hasattr(i, 'owner') and hasattr(i.owner, 'op'):
                 if isinstance(i.owner.op, theano.scan_module.scan_op.Scan):
                     scan_ops.append(i)
-            debugmode.debugprint(r=i, prefix=new_prefix, depth=depth, done=done,
+            debugmode.debugprint(r=i, prefix=new_prefix,
+                                 depth=depth, done=done,
                                  print_type=print_type, file=file,
                                  ids=ids, stop_on_name=stop_on_name,
-                                 prefix_child=new_prefix_child, scan_ops=scan_ops)
+                                 prefix_child=new_prefix_child,
+                                 scan_ops=scan_ops)
     if file is _file:
         return file
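The recursion above, print each result, collect any Scan-like ops along the way, then print their inner graphs separately at the end, can be sketched on a toy expression graph. The classes here are stand-ins, not Theano's:

```python
class Node:
    """A toy graph node; `inner` holds a sub-graph, like Scan's inner function."""
    def __init__(self, op, inputs=(), inner=None):
        self.op, self.inputs, self.inner = op, list(inputs), inner

def debug_print(node, depth=0, lines=None, inner_graphs=None):
    top = lines is None
    if top:
        lines, inner_graphs = [], []
    lines.append("  " * depth + node.op)
    if node.inner is not None:
        inner_graphs.append(node)  # defer its body until after the main graph
    for i in node.inputs:
        debug_print(i, depth + 1, lines, inner_graphs)
    if top:
        for n in inner_graphs:
            lines.append("Inner graph of %s:" % n.op)
            debug_print(n.inner, 1, lines, inner_graphs=[])
    return lines

g = Node("add", [Node("x"),
                 Node("scan", [Node("y")], inner=Node("mul", [Node("t")]))])
print("\n".join(debug_print(g)))
```

The main graph is printed first ("add", "x", "scan", "y"), then the collected inner graph under its own "Inner graph of scan:" banner, matching the two-phase structure of the diffed code.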
@@ -263,10 +273,10 @@ class OperatorPrinter:
     if (self.assoc == 'left' and i != 0 or self.assoc == 'right'
             and i != max_i):
         s = pprinter.process(input, pstate.clone(
             precedence=self.precedence + 1e-6))
     else:
         s = pprinter.process(input, pstate.clone(
             precedence=self.precedence))
     input_strings.append(s)
     if len(input_strings) == 1:
         s = self.operator + input_strings[0]
@@ -321,8 +331,8 @@ class FunctionPrinter:
     idx = node.outputs.index(output)
     name = self.names[idx]
     return "%s(%s)" % (name, ", ".join(
         [pprinter.process(input, pstate.clone(precedence=-1000))
          for input in node.inputs]))
 class MemberPrinter:
@@ -368,8 +378,8 @@ class DefaultPrinter:
     if node is None:
         return LeafPrinter().process(r, pstate)
     return "%s(%s)" % (str(node.op), ", ".join(
         [pprinter.process(input, pstate.clone(precedence=-1000))
          for input in node.inputs]))
 class LeafPrinter:
@@ -436,7 +446,7 @@ class PPrinter:
     if output in inv_updates:
         name = str(inv_updates[output])
         strings.append((i + 1000, "%s <- %s" % (
             name, pprinter.process(output))))
     i += 1
     if output.name is not None or output in outputs:
         if output.name is None:
@@ -508,13 +518,13 @@
 Print to the terminal a math-like expression.
 # colors not used: orange, amber#FFBF00, purple, pink,
 # used by default: green, blue, grey, red
 default_colorCodes = {'GpuFromHost': 'red',
                       'HostFromGpu': 'red',
                       'Scan': 'yellow',
                       'Shape': 'cyan',
                       'IfElse': 'magenta',
                       'Elemwise': '#FFAABB',  # dark pink
                       'Subtensor': '#FFAAFF',  # purple
                       'Alloc': '#FFAA22'}  # orange
 def pydotprint(fct, outfile=None,
@@ -526,8 +536,7 @@ def pydotprint(fct, outfile=None,
                assert_nb_all_strings=-1,
                return_image=False,
                ):
-    """
-    Print to a file (png format) the graph of a compiled theano function's ops.
+    """Print to a file (png format) the graph of a compiled theano function's ops.
     :param fct: a compiled Theano function, a Variable, an Apply or
     a list of Variable.
@@ -587,6 +596,11 @@ def pydotprint(fct, outfile=None,
     red ellipses are transfers from/to the gpu (ops with names GpuFromHost,
     HostFromGpu).
+    .. note::
+
+        Since October 20th, 2014, this prints the inner function of all
+        scans separately after the top-level output.
     """
     if colorCodes is None:
         colorCodes = default_colorCodes
@@ -623,7 +637,7 @@ def pydotprint(fct, outfile=None,
     topo = fct.toposort()
     if not pydot_imported:
         raise RuntimeError("Failed to import pydot. You must install pydot"
                            " for `pydotprint` to work.")
         return
     g = pd.Dot()
@@ -686,8 +700,8 @@ def pydotprint(fct, outfile=None,
     varstr = (input_update[var].variable.name + " UPDATE "
               + str(var.type))
     else:
-        #a var id is needed as otherwise var with the same type will be
-        #merged in the graph.
+        # a var id is needed as otherwise vars with the same type will be
+        # merged in the graph.
         varstr = str(var.type)
     if (varstr in all_strings) or with_ids:
         idx = ' id=' + str(len(var_str))
@@ -716,7 +730,7 @@ def pydotprint(fct, outfile=None,
     prof_str = ''
     if mode:
         time = mode.profile_stats[fct].apply_time.get(node, 0)
-        #second, % total time in profiler, %fct time in profiler
+        # second, % total time in profiler, %fct time in profiler
         if mode.local_time == 0:
             pt = 0
         else:
@@ -728,7 +742,7 @@ def pydotprint(fct, outfile=None,
         prof_str = ' (%.3fs,%.3f%%,%.3f%%)' % (time, pt, pf)
     elif profile:
         time = profile.apply_time.get(node, 0)
-        #second, %fct time in profiler
+        # second, %fct time in profiler
         if profile.fct_callcount == 0:
             pf = 0
         else:
@@ -778,7 +792,7 @@ def pydotprint(fct, outfile=None,
     nw_node = pd.Node(astr, shape=apply_shape)
 elif high_contrast:
     nw_node = pd.Node(astr, style='filled', fillcolor=use_color,
                       shape=apply_shape)
 else:
     nw_node = pd.Node(astr, color=use_color, shape=apply_shape)
 g.add_node(nw_node)
@@ -809,7 +823,7 @@ def pydotprint(fct, outfile=None,
 elif var.name or not compact:
     g.add_edge(pd.Edge(varstr, astr, label=label))
 else:
-    #no name, so we don't make a var ellipse
+    # no name, so we don't make a var ellipse
     g.add_edge(pd.Edge(apply_name(var.owner), astr, label=label))
 for id, var in enumerate(node.outputs):
@@ -892,7 +906,7 @@ def pydotprint_variables(vars,
     '''
     warnings.warn("pydotprint_variables() is deprecated."
                   " Use pydotprint() instead.")
     if colorCodes is None:
         colorCodes = default_colorCodes
@@ -976,7 +990,7 @@ def pydotprint_variables(vars,
     g.add_node(pd.Node(varastr))
 elif high_contrast:
     g.add_node(pd.Node(varastr, style='filled',
                        fillcolor='green'))
 else:
     g.add_node(pd.Node(varastr, color='green'))
 else:
...
@@ -567,12 +567,28 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
 PyObject* CudaNdarray_ZEROS(int n, int * dims)
 {
-    int total_elements = 1;
-    for(int i=0;i<n;i++)
+    size_t total_elements = 1;
+    for(size_t i=0;i<n;i++){
+        // Detect overflow on unsigned integer
+        if (dims[i] != 0 && total_elements > (SIZE_MAX / dims[i])) {
+            PyErr_Format(PyExc_RuntimeError,
+                         "Can't store in size_t for the bytes requested %llu * %llu",
+                         (unsigned long long)total_elements,
+                         (unsigned long long)dims[i]);
+            return NULL;
+        }
         total_elements*=dims[i];
+    }
     // total_elements now contains the size of the array, in reals
-    int total_size = total_elements * sizeof(real);
+    if (total_elements > (SIZE_MAX / sizeof(real))){
+        PyErr_Format(PyExc_RuntimeError,
+                     "Can't store in size_t for the bytes requested %llu * 4",
+                     (unsigned long long)total_elements);
+        return NULL;
+    }
+    size_t total_size = total_elements * sizeof(real);
     CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
     if (!rval)
@@ -592,7 +608,9 @@ PyObject* CudaNdarray_ZEROS(int n, int * dims)
     //fprintf(stdout, "Sizeof: %d\n", total_size);
     if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
     {
-        PyErr_Format(PyExc_MemoryError, "CudaNdarray_ZEROS: Error memsetting %d bytes of device memory.", total_size);
+        PyErr_Format(PyExc_MemoryError,
+                     "CudaNdarray_ZEROS: Error memsetting %llu bytes of device memory.",
+                     (unsigned long long)total_size);
         Py_DECREF(rval);
         return NULL;
     }
@@ -1272,8 +1290,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
     if (cpu_err_var != 0) {
         PyErr_Format(
             PyExc_IndexError,
-            "Cuda error: %s: The error code on the gpu is %i.\n",
-            "CudaNdarray_TakeFrom",
+            "CudaNdarray_TakeFrom: One of the index values is out of bounds.\n",
             cpu_err_var);
         // Must reset it to 0 so it doesn't need to be reset before each use.
         err = cudaMemset((void*)err_var, 0, sizeof(int));
...
@@ -378,8 +378,8 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
     //Detect overflow on unsigned integer
     if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
         PyErr_Format(PyExc_AssertionError,
-                     "Can't store in size_t for the bytes requested %llu",
-                     (unsigned long long)size);
+                     "Can't store in size_t for the bytes requested %llu * %llu",
+                     (unsigned long long)size, (unsigned long long)dim[i]);
         return -1;
     }
     size = size * dim[i];
@@ -395,7 +395,7 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
     //Detect overflow on unsigned integer
     if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
         PyErr_Format(PyExc_AssertionError,
-                     "Can't store in size_t for the bytes requested %llu",
+                     "Can't store in size_t for the bytes requested %llu * 4",
                      (unsigned long long)size);
         return -1;
     }
@@ -403,6 +403,14 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
     }
 }
+// Detect overflow on unsigned integer
+if (size > (SIZE_MAX / sizeof(real))) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Can't store in size_t for the bytes requested %llu",
+                 (unsigned long long)size);
+    return -1;
+}
 // If the allocated buffer is already of the right size, we don't need to
 // do anything else.
 // Note: self->data_allocated is 0 for a view, so views will fail this
...
-import copy
 import os
 import theano
 from theano import Apply, tensor
 from theano.gof.type import CDataType
 from theano.compat import PY3
-from theano.compat.six import StringIO
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import (GpuOp, cuda_available, active_device_number,
                                  device_properties)
@@ -56,6 +54,14 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
 """ % dict(var=var, err=err, desc=desc, fail=fail)
+def raise_no_dnn():
+    """Raise a RuntimeError if cuDNN can't be used."""
+    if not dnn_available():
+        raise RuntimeError(
+            "cuDNN optimization was enabled, but cuDNN is not available. " +
+            dnn_available.msg)
 class DnnBase(GpuOp):
     """
     Creates a handle for cudnn and pulls in the cudnn libraries and headers.
@@ -113,9 +119,9 @@ class GpuDnnConvDesc(GpuOp):
     self.conv_mode = conv_mode
 def make_node(self, img_shape, kern_shape):
-    if img_shape.type.ndim != 1 and img_shape.type.dtype != numpy.int64:
+    if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
         raise TypeError('img must be 1D shape tensor')
-    if kern_shape.type.ndim != 1 and kern_shape.type.dtype != numpy.int64:
+    if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
         raise TypeError('kern must be 1D shape tensor')
     return Apply(self, [img_shape, kern_shape],
@@ -918,11 +924,11 @@ err%(name)s = cudnnSoftmaxForward(
 # We need this since other stuff from opt is not importable.
 if cuda_available:
-    from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
-                                         gpu_optimizer)
+    from theano.sandbox.cuda.opt import local_optimizer, gpu_optimizer
     @local_optimizer([GpuConv])
     def local_conv_dnn(node):
+        raise_no_dnn()
         if isinstance(node.op, GpuConv):
             if node.op.border_mode not in ['full', 'valid']:
                 return
@@ -965,6 +971,7 @@ if cuda_available:
     @local_optimizer([GpuSoftmax])
     def local_softmax_dnn(node):
+        raise_no_dnn()
         if isinstance(node.op, GpuSoftmax):
             ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
             out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(gpu_contiguous(ins))
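The `raise_no_dnn` pattern, an optimizer first checks a runtime capability and fails loudly instead of silently skipping, is generic. A hedged sketch with stand-in names (none of these are Theano APIs):

```python
class FeatureUnavailable(RuntimeError):
    """Raised when an explicitly requested feature cannot be used."""

def make_guard(is_available, msg):
    """Build a guard that raises when the feature was requested but absent."""
    def guard():
        if not is_available():
            raise FeatureUnavailable(
                "optimization was enabled, but the feature is not available. "
                + msg)
    return guard

# Stand-in for raise_no_dnn(): the availability probe always says no here.
raise_no_dnn = make_guard(lambda: False, "Install cuDNN and set the library path.")

try:
    raise_no_dnn()
except FeatureUnavailable as e:
    print("refused:", e)
```

The design point is the same as in the diff: when a user opts in via a flag, a hard error with a diagnostic message (`dnn_available.msg` in the real code) beats an optimization that quietly never fires.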
...
@@ -25,12 +25,6 @@ from theano.sparse.type import SparseType, _is_sparse
 sparse_formats = ['csc', 'csr']
-# TODO: move this decorator to the compile submodule
-def register_specialize(lopt, *tags, **kwargs):
-    compile.optdb['specialize'].register((kwargs and kwargs.pop('name')) or
-                                         lopt.__name__, lopt, 'fast_run',
-                                         *tags)
 """ Types of sparse matrices to use for testing """
 _mtypes = [scipy.sparse.csc_matrix, scipy.sparse.csr_matrix]
 #_mtypes = [sparse.csc_matrix, sparse.csr_matrix, sparse.dok_matrix,
@@ -2686,7 +2680,7 @@ class HStack(gof.op.Op):
     for i in range(len(inputs))]
     if _is_sparse_variable(gz):
-        gz = DenseFromSparse()(gz)
+        gz = dense_from_sparse(gz)
     split = tensor.Split(len(inputs))(gz, 1,
                                       tensor.stack(
@@ -2753,7 +2747,7 @@ class VStack(HStack):
     for i in range(len(inputs))]
     if _is_sparse_variable(gz):
-        gz = DenseFromSparse()(gz)
+        gz = dense_from_sparse(gz)
     split = tensor.Split(len(inputs))(gz, 0,
                                       tensor.stack(
...
@@ -6,8 +6,8 @@ import scipy
 import theano
 from theano import gof, scalar, tensor
 from theano.tensor import blas
+from theano.tensor.opt import register_specialize, register_canonicalize
 from theano.sparse import (CSC, CSR, csm_properties,
-                           register_specialize,
                            csm_grad, usmm, csm_indices, csm_indptr,
                            csm_data)
 from theano.sparse import basic as sparse
@@ -29,7 +29,7 @@ def local_csm_properties_csm(node):
     return ret_var
     return False
-sparse.register_specialize(local_csm_properties_csm)
+register_specialize(local_csm_properties_csm)
 # This is tested in tests/test_basic.py:test_remove0
@@ -177,6 +177,16 @@ theano.compile.optdb.register('local_inplace_addsd_ccode',
     60, 'fast_run', 'inplace')
+@register_canonicalize("fast_compile")
+@register_specialize
+@gof.local_optimizer([sparse.DenseFromSparse])
+def local_dense_from_sparse_sparse_from_dense(node):
+    if isinstance(node.op, sparse.DenseFromSparse):
+        inp = node.inputs[0]
+        if inp.owner and isinstance(inp.owner.op, sparse.SparseFromDense):
+            return inp.owner.inputs
 @gof.local_optimizer([sparse.AddSD])
 def local_addsd_ccode(node):
     """
@@ -861,7 +871,7 @@ def local_usmm_csx(node):
     return [usmm_csc_dense(alpha, x_val, x_ind, x_ptr,
                            x_nsparse, y, z)]
     return False
-sparse.register_specialize(local_usmm_csx, 'cxx_only')
+register_specialize(local_usmm_csx, 'cxx_only')
 class CSMGradC(gof.Op):
@@ -1272,7 +1282,7 @@ def local_mul_s_d(node):
     sparse.csm_shape(svar))]
     return False
-sparse.register_specialize(local_mul_s_d, 'cxx_only')
+register_specialize(local_mul_s_d, 'cxx_only')
 class MulSVCSR(gof.Op):
@@ -1414,7 +1424,7 @@ def local_mul_s_v(node):
     return [CSx(c_data, s_ind, s_ptr, s_shape)]
     return False
-sparse.register_specialize(local_mul_s_v, 'cxx_only')
+register_specialize(local_mul_s_v, 'cxx_only')
 class StructuredAddSVCSR(gof.Op):
@@ -1573,7 +1583,7 @@ def local_structured_add_s_v(node):
     return [CSx(c_data, s_ind, s_ptr, s_shape)]
     return False
-sparse.register_specialize(local_structured_add_s_v, 'cxx_only')
+register_specialize(local_structured_add_s_v, 'cxx_only')
 class SamplingDotCSR(gof.Op):
@@ -1822,6 +1832,6 @@ def local_sampling_dot_csr(node):
     return [sparse.CSR(z_data, z_ind, z_ptr, p_shape)]
     return False
-sparse.register_specialize(local_sampling_dot_csr,
-                           'cxx_only',
-                           name='local_sampling_dot_csr')
+register_specialize(local_sampling_dot_csr,
+                    'cxx_only',
+                    name='local_sampling_dot_csr')
@@ -2,11 +2,11 @@ import numpy
 import scipy.sparse

 from theano import gof, tensor
+from theano.tensor.opt import register_specialize
 from theano.sparse.basic import (
     as_sparse_variable, SparseType, add_s_s, neg,
     mul_s_s, mul_s_d, dot,
-    CSMProperties, CSM, register_specialize,
+    CSMProperties, CSM,
     _is_sparse_variable, _is_dense_variable, CSC, CSR,
     csm_properties, csm_data, csm_indices, csm_indptr, csm_shape,
     _is_sparse,
@@ -122,8 +122,9 @@ class Binomial(gof.op.Op):
         n = tensor.as_tensor_variable(n)
         p = tensor.as_tensor_variable(p)
         shape = tensor.as_tensor_variable(shape)
-        return gof.Apply(self, [n, p, shape], [SparseType(dtype=self.dtype,
-                         format=self.format).make_variable()])
+        return gof.Apply(self, [n, p, shape],
+                         [SparseType(dtype=self.dtype,
+                                     format=self.format).make_variable()])

     def perform(self, node, (n, p, shape, ), (out, )):
         binomial = numpy.random.binomial(n, p, size=shape)
...
@@ -138,7 +138,7 @@ class TestSP(unittest.TestCase):
         # fixed parameters
         bsize = 10  # batch size
-        imshp = (28,28)
+        imshp = (8, 8)
         kshp = (5,5)
         nkern = 1  # per output pixel
         ssizes = ((1,1),(2,2))

@@ -151,8 +151,8 @@ class TestSP(unittest.TestCase):
         rng = numpy.random.RandomState(3423489)
         import theano.gof as gof
-        #Mode(optimizer='fast_run', linker=gof.OpWiseCLinker(allow_gc=False)),):
-        for mode in ('FAST_COMPILE','FAST_RUN'): #,profmode):
+        for mode in (None,):
             ntot, ttot = 0,0
             for conv_mode in convmodes:
                 for ss in ssizes:
...
@@ -2322,7 +2322,7 @@ class CastTester(utt.InferShapeTester):
                 eps = None
                 if o_dtype == 'float32':
-                    eps = 7e-4
+                    eps = 1e-2
                 verify_grad_sparse(Cast(o_dtype), data, eps=eps)

@@ -2336,8 +2336,7 @@ def _format_info(nb):
         spa = getattr(sp, format + '_matrix')
         x[format] = [variable() for t in range(nb)]
-        mat[format] = [spa(numpy.random.random_integers(5, size=(3, 4)) - 1,
-                           dtype=theano.config.floatX)
+        mat[format] = [spa(random_lil((3, 4), theano.config.floatX, 8))
                        for t in range(nb)]
     return x, mat

@@ -2386,7 +2385,8 @@ class _HVStackTester(utt.InferShapeTester):
                 self.op_class(format=out_f, dtype=dtype),
                 self.mat[format],
                 structured=False,
-                eps=7e-4)
+                eps=1e-2,
+                )


 def _hv_switch(op, expected_function):
...
@@ -139,3 +139,17 @@ def test_local_sampling_dot_csr():
     # be inserted
     assert not any(isinstance(node.op, sparse.opt.SamplingDotCSR) for node
                    in f.maker.fgraph.toposort())
+
+
+def test_local_dense_from_sparse_sparse_from_dense():
+    mode = theano.compile.mode.get_default_mode()
+    mode = mode.including("local_dense_from_sparse_sparse_from_dense")
+    m = theano.tensor.matrix()
+    for op in [theano.sparse.csr_from_dense, theano.sparse.csc_from_dense]:
+        s = op(m)
+        o = theano.sparse.dense_from_sparse(s)
+        f = theano.function([m], o, mode=mode)
+        # We should just have a deep copy.
+        assert len(f.maker.fgraph.apply_nodes) == 1
+        f([[1, 2], [3, 4]])
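The test above relies on the identity that the new rewrite exploits: converting a dense matrix to sparse and straight back yields the original values, so the whole round-trip can be replaced by a plain copy. A minimal sketch of that identity using scipy directly (not Theano, and not the rewrite itself):

```python
import numpy as np
import scipy.sparse as sp

# dense -> sparse -> dense round-trips losslessly for both CSR and CSC,
# which is why the graph can collapse to a single deep-copy node.
x = np.array([[1., 0.], [3., 4.]])
for fmt in (sp.csr_matrix, sp.csc_matrix):
    roundtrip = fmt(x).toarray()  # sparse_from_dense then dense_from_sparse
    assert np.array_equal(roundtrip, x)
```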
+
+
+def test_sparse_type():
+    import theano.sparse
+    # It needs to be available even if scipy is not available.
+    assert hasattr(theano.sparse, "SparseType")
@@ -10,6 +10,7 @@ from theano import gof, scalar
 from theano.gradient import DisconnectedType

 tensor = basic


 class CumsumOp(theano.Op):
     # See function cumsum for docstring
     def __init__(self, axis=None):

@@ -170,10 +171,11 @@ class CumprodOp(theano.Op):
         # We need to reverse the gradients along ``self.axis``,
         # compute cumsum, then reverse again
-        reverse_slicing = [slice(None,None,None)] * gi.ndim
-        reverse_slicing[self.axis] = slice(None,None,-1)
+        reverse_slicing = [slice(None, None, None)] * gi.ndim
+        reverse_slicing[self.axis] = slice(None, None, -1)
         reverse_slicing = tuple(reverse_slicing)
-        return [cumsum((fx * gi)[reverse_slicing], self.axis)[reverse_slicing] / x]
+        return [cumsum((fx * gi)[reverse_slicing],
+                       self.axis)[reverse_slicing] / x]
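The reverse-cumsum-reverse formula in this grad can be checked numerically. A numpy sketch (not the Theano Op itself) comparing the analytic expression against central finite differences for `sum(cumprod(x))`, where the incoming gradient `gi` is all ones:

```python
import numpy as np

x = np.array([1.5, 2.0, 0.5, 3.0])
fx = np.cumprod(x)
gi = np.ones_like(x)  # gradient of sum(cumprod(x)) wrt cumprod(x)

# Analytic gradient, mirroring the Op: reverse, cumsum, reverse, divide by x.
analytic = np.cumsum((fx * gi)[::-1])[::-1] / x

# Central finite-difference check of the same gradient.
eps = 1e-6
numeric = np.array([
    (np.cumprod(x + eps * (np.arange(x.size) == i)).sum()
     - np.cumprod(x - eps * (np.arange(x.size) == i)).sum()) / (2 * eps)
    for i in range(x.size)])
assert np.allclose(analytic, numeric, atol=1e-4)
```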
     def infer_shape(self, node, shapes):
         if self.axis is None:

@@ -845,18 +847,17 @@ class FillDiagonalOffset(gof.Op):
         neg_offset_flag = basic.lt(offset, 0)
         min_wh = basic.minimum(width, height)

-        start = offset * pos_offset_flag + offset_abs * width \
-            * neg_offset_flag
-        num_of_step = basic.minimum( min_wh, width * pos_offset_flag
-            + height * neg_offset_flag - offset_abs )
+        start = offset * pos_offset_flag + offset_abs * width * neg_offset_flag
+        num_of_step = basic.minimum(min_wh, width * pos_offset_flag +
+                                    height * neg_offset_flag - offset_abs)

         step = a.shape[1] + 1
         end = start + step * num_of_step

         # input of slice should be integer
-        start = basic.cast(start,'int32')
-        step = basic.cast(step,'int32')
-        end = basic.cast(end,'int32')
+        start = basic.cast(start, 'int32')
+        step = basic.cast(step, 'int32')
+        end = basic.cast(end, 'int32')

         wr_val = grad.flatten()[start:end:step].sum()

@@ -865,10 +866,11 @@ class FillDiagonalOffset(gof.Op):
             "offset is not defined for non-integer offset so"
             " fill_diagonal_offset(a,val,offset+eps) is undefined")

-        return [wr_a, wr_val,wr_offset]
+        return [wr_a, wr_val, wr_offset]


 fill_diagonal_offset_ = FillDiagonalOffset()


 def fill_diagonal_offset(a, val, offset):
     """
     Returns a copy of an array with all

@@ -885,3 +887,22 @@ def fill_diagonal_offset(a, val, offset):
     """
     return fill_diagonal_offset_(a, val, offset)
+
+
+def to_one_hot(y, nb_class, dtype=None):
+    """Return a matrix where each row corresponds to the one hot
+    encoding of the corresponding element in y.
+
+    :param y: A vector of integer values between 0 and nb_class - 1.
+    :param nb_class: The number of classes in y.
+    :param dtype: The dtype of the returned matrix. Defaults to floatX.
+
+    :return: A matrix of shape (y.shape[0], nb_class), where each
+        row ``i`` is the one hot encoding of the corresponding ``y[i]``
+        value.
+    """
+    ret = theano.tensor.zeros((y.shape[0], nb_class),
+                              dtype=dtype)
+    ret = theano.tensor.set_subtensor(ret[theano.tensor.arange(y.shape[0]), y],
+                                      1)
+    return ret
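The symbolic zeros-plus-`set_subtensor` pair above follows a standard numpy recipe. A plain-numpy sketch of the same encoding (illustrative only, not the Theano implementation):

```python
import numpy as np

def to_one_hot_np(y, nb_class, dtype="float64"):
    # Same recipe as the symbolic version: allocate a zero matrix,
    # then write a single 1 per row at the column given by y.
    ret = np.zeros((len(y), nb_class), dtype=dtype)
    ret[np.arange(len(y)), y] = 1
    return ret

# to_one_hot_np([1, 0, 2], 3) ->
# [[0., 1., 0.],
#  [1., 0., 0.],
#  [0., 0., 1.]]
```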
@@ -13,13 +13,13 @@ from theano.tensor.basic import _allclose, NotScalarConstantError

 class TestConv2D(utt.InferShapeTester):
     mode = None
-    dtype = 'float64'
+    dtype = theano.config.floatX

     def setUp(self):
         super(TestConv2D, self).setUp()
-        self.input = T.dtensor4('input')
+        self.input = T.tensor4('input', dtype=self.dtype)
         self.input.name = 'default_V'
-        self.filters = T.dtensor4('filters')
+        self.filters = T.tensor4('filters', dtype=self.dtype)
         self.filters.name = 'default_filters'
         if not conv.imported_scipy_signal and theano.config.cxx == "":
             raise SkipTest("conv2d tests need SciPy or a c++ compiler")
...
@@ -117,4 +117,4 @@ def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
     signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
     filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
-    utt.verify_grad(conv3d, [signals, filters])
+    utt.verify_grad(conv3d, [signals, filters], eps=1e-1)
@@ -309,8 +309,7 @@ class DownsampleFactorMaxGrad(Op):
                         zj = j // ds1
                         if (maxout[n, k, zi, zj] == x[n, k, i, j]):
                             gx[n, k, i, j] = gz[n, k, zi, zj]
-                        else:
-                            gx[n, k, i, j] = 0
+                        # No else clause needed as it is allocated with zeros
         gx_stg[0] = gx

     def infer_shape(self, node, in_shapes):
...
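Dropping the `else` branch above is safe because the gradient buffer is allocated filled with zeros, so non-argmax positions need no explicit write. A numpy sketch of that pattern on a single hypothetical 2x2 pooling region (illustrative, not the Op's actual loop):

```python
import numpy as np

x = np.array([[1., 3.], [2., 0.]])  # one 2x2 pooling region
maxout = x.max()                    # the pooled (max) value
gz = 1.0                            # incoming gradient for that region
gx = np.zeros_like(x)               # zero-initialized, so no else branch needed
for i in range(2):
    for j in range(2):
        if x[i, j] == maxout:
            gx[i, j] = gz
# Only the argmax position receives gradient; the rest stay zero.
assert np.array_equal(gx, [[0., 1.], [0., 0.]])
```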
@@ -3277,20 +3277,24 @@ class T_Join_and_Split(unittest.TestCase):
         self.assertTrue((out == want).all())

     def test_join_matrix1(self):
-        av = numpy.array([[1, 2, 3], [4, 5, 6]], dtype='float32')
-        bv = numpy.array([[7], [8]], dtype='float32')
+        av = numpy.array([[.1, .2, .3], [.4, .5, .6]], dtype='float32')
+        bv = numpy.array([[.7], [.8]], dtype='float32')
         a = self.shared(av)
         b = as_tensor_variable(bv)
         s = join(1, a, b)
-        want = numpy.array([[1, 2, 3, 7], [4, 5, 6, 8]], dtype='float32')
+        want = numpy.array([[.1, .2, .3, .7], [.4, .5, .6, .8]],
+                           dtype='float32')
         out = self.eval_outputs_and_check_join([s])
         self.assertTrue((out == want).all())
-        # assert tensor.grad(join(1,a,b), a
         utt.verify_grad(lambda a, b: join(1, a, b), [av, bv],
-                        eps=1.0e-4, rel_tol=1.0e-3, mode=self.mode)
+                        mode=self.mode)

     def test_join_matrix_dtypes(self):
+        if "float32" in self.shared.__name__:
+            raise SkipTest(
+                "The shared variable constructor"
+                " needs to support dtypes other than float32")
         # Test mixed dtype. There was a bug that caused crash in the past.
         av = numpy.array([[1, 2, 3], [4, 5, 6]], dtype='int8')
         bv = numpy.array([[7], [8]], dtype='float32')

@@ -3304,9 +3308,13 @@ class T_Join_and_Split(unittest.TestCase):
         grad(s.sum(), b)
         grad(s.sum(), a)
         utt.verify_grad(lambda b: join(1, a, b), [bv],
-                        eps=1.0e-4, rel_tol=1.0e-3, mode=self.mode)
+                        eps=1.0e-2, mode=self.mode)

     def test_join_matrix_ints(self):
+        if "float32" in self.shared.__name__:
+            raise SkipTest(
+                "The shared variable constructor"
+                " needs to support dtypes other than float32")
         # Test mixed dtype. There was a bug that caused crash in the past.
         av = numpy.array([[1, 2, 3], [4, 5, 6]], dtype='int8')
         bv = numpy.array([[7], [8]], dtype='int32')

@@ -3331,20 +3339,21 @@ class T_Join_and_Split(unittest.TestCase):
         self.assertTrue((out == want).all())

     def test_join_matrix1_using_horizontal_stack(self):
-        av = numpy.array([[1, 2, 3], [4, 5, 6]], dtype='float32')
-        bv = numpy.array([[7], [8]], dtype='float32')
-        cv = numpy.array([[3, 2, 1], [6, 5, 4]], dtype='float32')
+        av = numpy.array([[.1, .2, .3], [.4, .5, .6]], dtype='float32')
+        bv = numpy.array([[.7], [.8]], dtype='float32')
+        cv = numpy.array([[.3, .2, .1], [.6, .5, .4]], dtype='float32')
         a = self.shared(av)
         b = as_tensor_variable(bv)
         c = as_tensor_variable(cv)
         s = horizontal_stack(a, b, c)
-        want = numpy.array([[1, 2, 3, 7, 3, 2, 1], [4, 5, 6, 8, 6, 5, 4]],
+        want = numpy.array([[.1, .2, .3, .7, .3, .2, .1],
+                            [.4, .5, .6, .8, .6, .5, .4]],
                            dtype='float32')
         out = self.eval_outputs_and_check_join([s])
         self.assertTrue((out == want).all())
         utt.verify_grad(lambda a, b: join(1, a, b), [av, bv],
-                        eps=1.0e-4, rel_tol=1.0e-3, mode=self.mode)
+                        mode=self.mode)

     def test_join_matrixV(self):
         """variable join axis"""
...
@@ -571,7 +571,7 @@ class test_Prod(unittest.TestCase):
         # including zeros, as the case with zeros is important
         # (and special cases: 1 zero in the row, more than 1 zero in the row)
-        x_val = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        x_val = numpy.asarray([[.1, .2, .3], [.4, .5, .6], [.7, .8, .9]],
                               dtype='float32')
         # now with verify_grad
         unittest_tools.verify_grad(Prod(axis=1), [x_val], mode=self.mode)
...
@@ -7,9 +7,11 @@ from theano.tests import unittest_tools as utt
 from theano.tensor.extra_ops import (CumsumOp, cumsum, CumprodOp, cumprod,
                                      BinCountOp, bincount, DiffOp, diff,
-                                     squeeze, RepeatOp, repeat, Bartlett, bartlett,
-                                     FillDiagonal, fill_diagonal, FillDiagonalOffset,
-                                     fill_diagonal_offset)
+                                     squeeze, RepeatOp, repeat,
+                                     Bartlett, bartlett,
+                                     FillDiagonal, fill_diagonal,
+                                     FillDiagonalOffset, fill_diagonal_offset,
+                                     to_one_hot)
 from theano import tensor as T
 from theano import config, tensor, function

@@ -529,3 +531,30 @@ class TestFillDiagonalOffset(utt.InferShapeTester):
                                 test_offset],
                                self.op_class )
+
+
+def test_to_one_hot():
+    v = theano.tensor.ivector()
+    o = to_one_hot(v, 10)
+    f = theano.function([v], o)
+    out = f([1, 2, 3, 5, 6])
+    assert out.dtype == theano.config.floatX
+    assert numpy.allclose(
+        out,
+        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
+         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
+         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
+         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
+         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]])
+
+    v = theano.tensor.ivector()
+    o = to_one_hot(v, 10, dtype="int32")
+    f = theano.function([v], o)
+    out = f([1, 2, 3, 5, 6])
+    assert out.dtype == "int32"
+    assert numpy.allclose(
+        out,
+        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
+         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
+         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
+         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
+         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]])