Commit 297f927d authored by lamblin

Merge pull request #1394 from nouiz/canopy

Canopy/GpuMin/compiledir_format/ScalarOp.c_support_code/...
@@ -106,7 +106,7 @@ def main():
                'Use --without-knownfailure to disable this warning.')
        else:
            sys.argv.remove('--without-knownfailure')
    # When 'theano-nose' is called-back under the time-profile option, an
    # instance of the custom Nosetests plugin class 'DisabDocString' (see
    # below) is loaded. The latter ensures that the test name will not be
@@ -114,7 +114,22 @@ def main():
    if '--disabdocstring' in sys.argv:
        addplugins.append(DisabDocString())
    try:
        if addplugins:
            ret = nose.main(addplugins=addplugins)
        else:
            ret = nose.main()
        return ret
    except TypeError, e:
        if "got an unexpected keyword argument 'addplugins'" in e.message:
            # This means nose is too old and does not support plugins.
            _logger.warn(
                'KnownFailure plugin from NumPy can\'t'
                ' be used as nosetests is too old. '
                'Use --without-knownfailure to disable this warning.')
            nose.main()
        else:
            raise
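The fallback above can be sketched in isolation (using modern `except ... as e` syntax; `old_main` and `new_main` are hypothetical stand-ins for an old and a new `nose.main`):

```python
# Sketch: call an API with an optional keyword argument, and retry
# without it when the installed version is too old to accept it.

def run(main, plugins):
    """Call `main`, passing `addplugins` only if supported."""
    try:
        return main(addplugins=plugins)
    except TypeError as e:
        if "unexpected keyword argument 'addplugins'" in str(e):
            # Old version: no plugin support; run without plugins.
            return main()
        raise

def old_main():  # stands in for an old nose.main without plugin support
    return "ran-without-plugins"

def new_main(addplugins=None):
    return "ran-with-%d-plugins" % len(addplugins or [])

print(run(old_main, ["p1"]))  # falls back: ran-without-plugins
print(run(new_main, ["p1"]))  # ran-with-1-plugins
```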
def help():
......
@@ -57,11 +57,20 @@ directory, so that when you pull updates via Git, they will be
automatically reflected in the "installed" version. For more information about
installation and configuration, see :ref:`installing Theano <install>`.

Status
======

.. image:: https://secure.travis-ci.org/Theano/Theano.png?branch=master
   :target: http://travis-ci.org/Theano/Theano/builds

.. image:: https://pypip.in/v/Theano/badge.png
   :target: https://crate.io/packages/Theano/
   :alt: Latest PyPI version

.. image:: https://pypip.in/d/Theano/badge.png
   :target: https://crate.io/packages/Theano/
   :alt: Number of PyPI downloads

.. _available on PyPI: http://pypi.python.org/pypi/Theano
.. _Related Projects: https://github.com/Theano/Theano/wiki/Related-projects
......
@@ -150,9 +150,8 @@ directory; see the `virtualenv documentation`_ for details.

.. note::

    Theano *can* be installed with easy_install_, however we recommend pip_.
    ``pip`` offers many benefits over ``easy_install`` such as more
    intelligent dependency management, better error messages and a
    ``pip uninstall`` command for easily removing packages.
@@ -533,6 +532,7 @@ it differently and it worked, please let us know the details on the
`theano-users`_ mailing-list, so that we can add alternate instructions
here.

In academia: Enthought Python Distribution (EPD)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -798,12 +798,36 @@ pip is not included in EPD, but you can simply install it with::

You can then proceed to the :ref:`windows_basic` or the :ref:`windows_bleeding_edge`.
Alternative: Canopy
###################

Canopy is another software distribution from Enthought that installs all
of Theano's dependencies. If you are affiliated with a university (as a
student or employee), you can download the installer for free.
- Install Canopy x64, and update it to the latest version (`Help /
Software updates...`), as older Canopy versions have trouble installing
`pip`.
- Then install `pip` from Canopy Package Manager.
- In the Windows shell (type `cmd` in the Windows start menu to bring it up),
type `pip install theano`.
- In Canopy Package Manager, search for and install the packages "mingw 4.5.2" and "libpython 1.2".
- (Needed only for Theano 0.6rc3 or earlier)
The "libpython 1.2" package installs files `libpython27.a` and `libmsvcr90.a` to
`C:\Users\<USER>\AppData\Local\Enthought\Canopy\User\libs`. Copy the two files to
`C:\Users\<USER>\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.0.0.1160.win-x86_64\libs`.
- (Needed only for Theano 0.6rc3 or earlier) Set the Theano flags
``blas.ldflags=-LC:\Users\<USER>\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.0.0.1160.win-x86_64\Scripts -lmk2_core -lmk2_intel_thread -lmk2_rt``.
Alternative: AnacondaCE
#######################

ContinuumIO_ provides a free Python distribution for Windows (32-bit
and 64-bit), including all dependencies of Theano. If you are not
eligible for a download of EPD or Canopy (via a commercial or free academic
licence), this is the easiest way to install Theano's dependencies.
Simply download and execute the installer from the
`AnacondaCE download page <http://continuum.io/anacondace.html>`__,
......
@@ -39,7 +39,7 @@ probably do something similar on older computer.

Installation steps
~~~~~~~~~~~~~~~~~~

Ubuntu 11.10/12.04/12.10/13.04:

1) ``sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git``
2) ``sudo pip install Theano``
@@ -62,6 +62,13 @@ Test the newly installed packages

2) SciPy (~1m): ``python -c "import scipy; scipy.test()"``
3) Theano (~30m): ``python -c "import theano; theano.test()"``
NumPy 1.6.2, 1.7.0 and 1.7.1 have a bug that marks some ndarrays as not
aligned. Theano does not support unaligned arrays and raises an
exception when it encounters one, so one test can fail with an
alignment error under those NumPy versions. You can ignore that test
error: at worst, your code will crash rather than compute wrong
results. If this happens, you can install another NumPy version to fix
the problem. Ubuntu 12.10 ships NumPy 1.6.2 and Ubuntu 13.04 ships
NumPy 1.7.1.
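To see the flag Theano checks, one can inspect `ndarray.flags.aligned` directly. A small sketch, assuming NumPy is installed; the misaligned view is constructed on purpose via a one-byte offset into a raw buffer:

```python
import numpy

# A normally-allocated array is aligned for its dtype.
a = numpy.zeros((3, 4), dtype='float32')
print(a.flags.aligned)  # True

# A deliberately misaligned float32 view, built from a byte buffer
# with a one-byte offset (the data pointer is not 4-byte aligned):
buf = numpy.zeros(4 * 4 + 1, dtype='uint8')
b = numpy.frombuffer(buf.data, dtype='float32', count=4, offset=1)
print(b.flags.aligned)  # False
```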
Speed test Theano/BLAS
~~~~~~~~~~~~~~~~~~~~~~
......
@@ -17,4 +17,5 @@ API

.. automodule:: theano.sandbox.linalg.ops
    :members:

.. automodule:: theano.sandbox.linalg.kron
    :members:
@@ -4,11 +4,23 @@

:mod:`nnet` -- Ops for neural networks
======================================================

.. module:: tensor.nnet
   :platform: Unix, Windows
   :synopsis: Ops for neural networks
.. moduleauthor:: LISA
- Sigmoid

  - :func:`sigmoid`
  - :func:`ultra_fast_sigmoid`
  - :func:`hard_sigmoid`

- Others

  - :func:`softplus`
  - :func:`softmax`
  - :func:`binary_crossentropy`
  - :func:`.categorical_crossentropy`
.. function:: sigmoid(x)

    Returns the standard sigmoid nonlinearity applied to x
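For illustration only, here are pure-Python sketches of the nonlinearities listed above — not Theano's actual elementwise implementations, which operate symbolically on tensors:

```python
import math

def sigmoid(x):
    # exact logistic function: 1 / (1 + exp(-x))
    return 1.0 / (1.0 + math.exp(-x))

def hard_sigmoid(x):
    # piecewise-linear approximation, cheaper to compute
    return min(1.0, max(0.0, 0.2 * x + 0.5))

def softplus(x):
    # smooth approximation of max(0, x): log(1 + exp(x))
    return math.log1p(math.exp(x))

print(sigmoid(0.0))       # 0.5
print(hard_sigmoid(0.0))  # 0.5
print(softplus(0.0))      # log(2), about 0.693
```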
@@ -72,6 +84,7 @@

.. note:: The underlying code will return an exact 0 or 1 if an
    element of x is too small or too big.

.. function:: softplus(x)

    Returns the softplus nonlinearity applied to x
......
@@ -1446,8 +1446,25 @@ def std_lib_dirs_and_libs():
        libname = 'python' + python_version.replace('.', '')
        # Also add directory containing the Python library to the library
        # directories.
        python_lib_dirs = [os.path.join(os.path.dirname(python_inc), 'libs')]
        if "Canopy" in python_lib_dirs[0]:
            # Canopy stores libpython27.a and libmsvcr90.a in this directory.
            # For some reason, these files are needed when compiling Python
            # modules, even when libpython27.lib and python27.dll are
            # available, and the *.a files have to be found earlier than
            # the other ones.
            libdir = os.path.join(sys.base_prefix, '..', '..', '..',
                                  'User', 'libs')
            for f, lib in [('libpython27.a', 'libpython 1.2'),
                           ('libmsvcr90.a', 'mingw 4.5.2')]:
                if not os.path.exists(os.path.join(libdir, f)):
                    print ("Your python version is from Canopy. " +
                           "You need to install the package '" + lib +
                           "' from Canopy package manager."
                           )
            python_lib_dirs.insert(0, libdir)
        return [libname], python_lib_dirs
    # Suppress -lpython2.x on OS X since the `-undefined dynamic_lookup`
    # makes it unnecessary.
......
@@ -5,6 +5,7 @@ import platform
import re
import shutil
import struct
import socket
import subprocess
import sys
import textwrap
@@ -67,6 +68,7 @@ compiledir_format_dict = {
    "theano_version": theano.__version__,
    "numpy_version": numpy.__version__,
    "gxx_version": gcc_version_str.replace(" ", "_"),
    "hostname": socket.gethostname(),
    }
compiledir_format_keys = ", ".join(sorted(compiledir_format_dict.keys()))
default_compiledir_format = ("compiledir_%(platform)s-%(processor)s-"
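The added "hostname" key becomes available for substitution in the compiledir format string. A minimal sketch of the expansion mechanism, with made-up values standing in for the real dictionary entries:

```python
# Theano expands the compiledir format with Python's %-dict substitution.
compiledir_format_dict = {
    "platform": "Linux-3.8",   # hypothetical values for illustration
    "processor": "x86_64",
    "hostname": "mybox",       # the key this commit adds
}
fmt = "compiledir_%(platform)s-%(processor)s-%(hostname)s"
print(fmt % compiledir_format_dict)
# -> compiledir_Linux-3.8-x86_64-mybox
```

Including the hostname keeps compiled-module caches separate when a home directory is shared between machines.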
......
@@ -116,6 +116,11 @@ def deprecated(filename, msg=''):

def uniq(seq):
    """
    Do not use set: this function must always return the same element at
    the same index. If two sequences share the same pattern of
    duplication but differ in their values, the order of first
    occurrences must be preserved.
    """
    #TODO: consider building a set out of seq so that the if condition
    #is constant time -JB
    return [x for i, x in enumerate(seq) if seq.index(x) == i]
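A quick usage sketch of `uniq`, showing the order guarantee the docstring asks for:

```python
# Order-preserving deduplication: keep each element's first occurrence,
# in its original position.
def uniq(seq):
    return [x for i, x in enumerate(seq) if seq.index(x) == i]

print(uniq([3, 1, 3, 2, 1]))  # [3, 1, 2]
# A set() would not guarantee this: set iteration order is unspecified,
# so the same duplication pattern could come back in a different order.
```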
......
@@ -488,7 +488,6 @@ class GpuDimShuffle(GpuOp):
            print sio.getvalue()
            print '--------------------------------------'
            if 0:
                sys.exit()
        return sio.getvalue()
@@ -911,7 +910,8 @@ class GpuCAReduce(GpuOp):
        dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
        self._n_scalar_op_calls += 1
        return self.scalar_op.c_code(dummy_node, dummy_name, (left, right),
                                     (left,), sub)

    def _k_reduce_buf(self, z_pos, node, name, sub):
        """
@@ -1189,7 +1189,10 @@ class GpuCAReduce(GpuOp):
            self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)

    def c_code_reduce_10(self, sio, node, name, x, z, fail):
        if not isinstance(self.scalar_op, (scal.Add,
                                           scal.Maximum,
                                           scal.Minimum)):
            raise NotImplementedError()
        print >> sio, """
        {
            int verbose = 0;
@@ -1736,42 +1739,29 @@ class GpuCAReduce(GpuOp):
            # extra 0, I would need to change the behavior of the sum reduction
            # code to do that. I don't want to benchmark and test changes to the
            # sum code so I will leave that for later.
            # max/min reduction is also a special case that is simple to implement.
            # this is the special case where reduction is idempotent so it doesn't
            # matter if we reduce with the first element multiple times.
            if isinstance(self.scalar_op, (scal.Add, scal.Maximum, scal.Minimum)):
                # special cased max/min code (special case because visits first
                # member of each row twice)
                if isinstance(self.scalar_op, scal.Add):
                    reduce_init = "0.f;"
                else:
                    reduce_init = "A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0];" % locals()
                reduce_fct = self._assign_reduce(
                    node, nodename, "myresult",
                    "A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
                    {})
                print >> sio, """
                %(decl)s{
                    %(init)s
                    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
                        myresult = %(reduce_init)s
                        %(for_i1)s{
                            %(for_i2)s{
                                %(for_i3)s{
                                    %(reduce_fct)s;
                                }
                            }
                        }
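The unified template works because each scalar op only needs a different starting value: addition starts from its identity `0.f`, while max/min start from the first element of the row — reading that element twice is harmless because max and min are idempotent. A pure-Python sketch of the same scheme:

```python
# reduce_row mirrors the kernel's structure: pick an initial accumulator
# (identity for add, first element for max/min), then fold over the row.
def reduce_row(row, op, init_from_first):
    acc = row[0] if init_from_first else 0.0
    for v in row:  # note: visits row[0] again when init_from_first
        acc = op(acc, v)
    return acc

row = [4.0, -2.0, 7.0]
print(reduce_row(row, lambda a, b: a + b, False))  # 9.0
print(reduce_row(row, max, True))                  # 7.0 (row[0] seen twice, harmless)
print(reduce_row(row, min, True))                  # -2.0
```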
@@ -1791,14 +1781,25 @@ class GpuCAReduce(GpuOp):
            # code to make sure it does not cause a slowdown
            raise NotImplementedError()
        if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
            if not isinstance(self.scalar_op, (scal.Add,
                                               scal.Maximum,
                                               scal.Minimum)):
                raise NotImplementedError()
            # this kernel uses one block for each column,
            # threads per block for each element per column.

            #TODO: This kernel is pretty inefficient in terms of reading, because if A is
            #      c_contiguous (typical case) then each warp is accessing non-contiguous
            #      memory (a segment of a column).
            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]',
                                           node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
                                             {})
            if isinstance(self.scalar_op, scal.Add):
                reduce_init = "0.f;"
            else:
                reduce_init = "A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2];"
            print >> sio, """
            static __global__ void kernel_reduce_010_%(nodename)s(
                    const int d0,
@@ -1822,10 +1823,10 @@ class GpuCAReduce(GpuOp):
            {
                for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
                {
                    float myresult = %(reduce_init)s
                    for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                    {
                        %(reduce_fct)s;
                    }
                    %(reducebuf)s
                }
@@ -2307,6 +2308,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
            return ()
        return (3, hv)


class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
    """
    Implement AdvancedSubtensor1 on the gpu.
@@ -3104,3 +3106,95 @@ def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
                        for i in node.inputs]),
            print str([getattr(i, 'dtype', None)
                       for i in node.outputs])
class GpuEye(GpuOp):
    def __init__(self, dtype=None):
        if dtype is None:
            dtype = config.floatX
        assert dtype == 'float32'
        self.dtype = dtype

    def make_node(self, n, m, k):
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        # k != 0 isn't implemented on the GPU yet.
        assert tensor.get_scalar_constant_value(k) == 0
        return Apply(self, [n, m], [matrix(dtype=self.dtype)])

    def infer_shape(self, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        return [grad_undefined(self, i, inp[i]) for i in xrange(3)]

    def __eq__(self, other):
        return type(self) == type(other) and self.dtype == other.dtype

    def __hash__(self):
        return hash(self.dtype) ^ hash(type(self))

    def c_support_code(self):
        return """
        //Only 1 block is used.
        __global__ void kEye(float* a, int n, int m) {
            int nb_elem = min(n, m);
            for (unsigned int i = threadIdx.x; i < nb_elem; i += blockDim.x) {
                a[i*m + i] = 1;
            }
        }"""

    def c_code(self, node, name, inp, out, sub):
        n, m = inp
        z, = out
        fail = sub['fail']
        s = """
        int dims[] = {0, 0};
        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        int total_size = dims[0] * dims[1] * sizeof(float);
        cudaError_t sts;
        void * orig_z = %(z)s;

        if (CudaNdarray_prep_output(&%(z)s, 2, dims))
        {
            %(fail)s;
        }

        sts = cudaMemset(CudaNdarray_DEV_DATA(%(z)s), 0, total_size);
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_MemoryError,
                         "GpuEye: Error in memset %%d bytes of device memory.",
                         total_size);
            if (orig_z == NULL)
                Py_XDECREF(%(z)s);
            %(fail)s;
        }

        kEye<<<1, 256>>>(CudaNdarray_DEV_DATA(%(z)s), dims[0], dims[1]);
        CNDA_THREAD_SYNC;
        sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "Cuda error: kEye: %%s. n=%%d, m=%%d.",
                         cudaGetErrorString(sts),
                         dims[0], dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        return (3,)

gpu_eye = GpuEye(dtype='float32')
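The `kEye` kernel writes `1` at flat offset `i*m + i` of a zeroed row-major `n x m` buffer, one diagonal element per thread. The same indexing in pure Python, for illustration only:

```python
# eye_flat mimics kEye's addressing: flat row-major buffer, diagonal
# entries at positions i*m + i for i < min(n, m).
def eye_flat(n, m):
    a = [0.0] * (n * m)
    for i in range(min(n, m)):  # the kernel strides this loop by blockDim.x
        a[i * m + i] = 1.0
    return a

print(eye_flat(2, 3))  # [1.0, 0.0, 0.0, 0.0, 1.0, 0.0]
```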
""" """This file implement 3 different version of the elemwise op on the
This file implement 3 different version of the elemwise op on the gpu. Only NaiveAlgo is used and it is not very naive now. gpu. Only NaiveAlgo is used and it is not very naive now.
The elemwise fct are also used with scalar operation! So it can happen
that ndim is 0 as with all scalar type.
The elemwise fct are also used with scalar operation! So it can happen that ndim is 0 as with all scalar type.
""" """
@@ -25,20 +27,26 @@ _logger.addHandler(logging.StreamHandler())  # TO REMOVE

def _logical_scalar(x):
    return numpy.all(x.type.broadcastable)


def get_str_list_logical_scalar(node, value_str='ii_i%i_value',
                                data_str='ii_i%i_data[0]'):
    l = []
    for ipos, i in enumerate(node.inputs):
        if _logical_scalar(i):
            l += [value_str % ipos]
        else:
            l += [data_str % ipos]
    return l
class SupportCodeError(Exception):
    """It is currently not possible to auto-generate a GPU implementation
    for an elementwise Op with c_support_code_apply().

    But we support Op.c_support_code."""
class NaiveAlgo(object):
    verbose = 0  # 1, 2 or 3 for more verbose output.

    @property
    def cache_version(self):
@@ -73,16 +81,20 @@ class NaiveAlgo(object):
print >> sio, "// Input ", ipos, str(i.type) print >> sio, "// Input ", ipos, str(i.type)
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
print >> sio, "// Output ", ipos, str(i.type) print >> sio, "// Output ", ipos, str(i.type)
print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (self.scalar_op.__class__.__name__,nodename, nd) print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
self.scalar_op.__class__.__name__, nodename, nd)
if (nd): if (nd):
print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) print >> sio, "\t,", ", ".join("const int dim%i" % i
for i in xrange(nd))
#declare inputs #declare inputs
for ipos, i in enumerate(node.inputs): for ipos, i in enumerate(node.inputs):
s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd))) s = ", ".join(["const float * i%i_data" % ipos] +
["int i%i_str_%i" % (ipos, d) for d in xrange(nd)])
print >> sio, "\t,", s print >> sio, "\t,", s
#declare outputs #declare outputs
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) s = ", ".join(["float * o%i_data" % ipos] +
["int o%i_str_%i" % (ipos, d) for d in xrange(nd)])
print >> sio, "\t,", s print >> sio, "\t,", s
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos #print >> sio, "\t,", "float * o%i_data" % ipos
@@ -121,13 +133,15 @@ class NaiveAlgo(object):
        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        task_code = self.scalar_op.c_code(
            Apply(self.scalar_op,
                  [scalar.Scalar(dtype=input.type.dtype)()
                   for input in node.inputs],
                  [scalar.Scalar(dtype=output.type.dtype)()
                   for output in node.outputs]),
            nodename + '_scalar_',
            get_str_list_logical_scalar(node),
            ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print >> sio, "       ", task_code
        print >> sio, "    }"
@@ -837,10 +851,9 @@ nd_collapse_[i]=0;
        #N.B. cudaGetLastError is called by c_code
        return sio.getvalue()

    def c_support_code_apply(self, node, nodename):
        nd = node.outputs[0].type.ndim
        defines = """
#define INTDIV_POW2(a, b) (a >> b)
#define INTMOD_POW2(a, b) (a & ((1<<b)-1))
"""
@@ -851,7 +864,7 @@ nd_collapse_[i]=0;
        return defines + kernels

    def c_support_code(self):
        return self.scalar_op.c_support_code()
    def c_code(self, node, nodename, inputs, outputs, sub):
        d = dict(sub)
@@ -867,7 +880,7 @@ nd_collapse_[i]=0;
        print >> sio, """
        //std::cerr << "C_CODE %(opname)s START\\n";
        //standard elemwise size checks
        """ % locals()
        if nd > 0:
            print >> sio, """
            int dims[%(nd)s] = {%(initial_dims)s};
@@ -931,11 +944,11 @@ nd_collapse_[i]=0;
                    %(fail)s;
                }
            }
            """ % locals()
            emitted_inames[iname] = True

        #check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            if idx not in self.inplace_pattern.keys():
                print >> sio, """
                for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
@@ -1037,7 +1050,7 @@ nd_collapse_[i]=0;
class ErfinvGPU(Erfinv):
    """
    Provides a c-code implementation of the inverse error function for GPU.

    Note: We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we
    currently rely on Nvidia's cublas library to provide the erfinv
    c-implementation (which requires different c_headers). As it stands,
......
@@ -597,7 +597,7 @@ def local_gpu_careduce(node):
        scalar_op = node.op.scalar_op
        # currently, only these ops are supported at all,
        # and max does not support all combinations of axes
        if node.op.scalar_op in [scal.add, scal.maximum, scal.minimum]:
            x, = node.inputs
            if x.owner and x.owner.op == host_from_gpu:
                if node.op.axis is None:
@@ -1354,6 +1354,27 @@ def local_gpualloc_memset_0(node):
    return [new_out]


@register_opt()
@local_optimizer([])
def local_gpu_eye(node):
    """
    gpu_from_host(eye) -> gpueye(gpu_from_host)
    eye(host_from_gpu) -> host_from_gpu(gpueye)
    """
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, tensor.Eye) and
            host_input.owner.op.dtype == "float32"):
            return [gpu_eye(*host_input.owner.inputs)]
    if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
                      for i in node.inputs]):
            return [host_from_gpu(gpu_eye(*node.inputs))]
    return False
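A toy sketch of the rewrite this optimizer performs — replacing `gpu_from_host(eye(...))` with a single GPU op. The `Node` class and string-valued ops here are made up for illustration; Theano's optimizer actually pattern-matches on `Apply` nodes in the function graph:

```python
# Minimal expression tree with string ops, just to show the rewrite shape.
class Node(object):
    def __init__(self, op, inputs):
        self.op, self.inputs = op, inputs
    def __repr__(self):
        return "%s(%s)" % (self.op, ", ".join(map(repr, self.inputs)))

def rewrite(node):
    # gpu_from_host(eye(n, m)) -> gpu_eye(n, m)
    if node.op == "gpu_from_host":
        inner = node.inputs[0]
        if isinstance(inner, Node) and inner.op == "eye":
            return Node("gpu_eye", inner.inputs)
    return node  # no match: leave the graph unchanged

g = Node("gpu_from_host", [Node("eye", [3, 4])])
print(rewrite(g))  # gpu_eye(3, 4)
```

The real optimizer also handles the mirrored pattern, pushing `eye` onto the GPU when any of its inputs already live there.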
def safe_to_gpu(x):
    if (isinstance(x.type, tensor.TensorType) and
        x.type.dtype == 'float32'):
......
@@ -66,7 +66,8 @@ def test_careduce():
    """
    for scalar_op, careduce_op in [
            (theano.scalar.add, tensor.elemwise.CAReduceDtype),
            (theano.scalar.maximum, tensor.CAReduce),
            (theano.scalar.minimum, tensor.CAReduce)]:
        for shape, pattern in [((1, 1), (1,)),
                               ((1, 0), (1,)),
                               ((0, 1), (1,)),
@@ -123,9 +124,10 @@ def test_careduce():
        op = careduce_op(scalar_op, axis=pattern)
        pat = tensor_pattern_to_gpu_pattern(shape, pattern)
        #GpuCAReduce{maximum/minimum} supports only those patterns
        if scalar_op in [theano.scalar.maximum,
                         theano.scalar.minimum] and pat not in [
                (0, 1), (0, 1, 1), (1, 0)]:
            continue

        a = tensor.TensorType('float32', (False,) * len(shape))()
@@ -191,10 +193,12 @@ def test_careduce():
            ((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3])]:
        op = careduce_op(scalar_op, axis=pattern)
        pat = tensor_pattern_to_gpu_pattern(shape, pattern)
        #GpuCAReduce{maximum/minimum} supports only those patterns
        if scalar_op in [theano.scalar.maximum,
                         theano.scalar.minimum] and pat not in [
                (0, 1), (0, 1, 1), (1, 0)]:
            continue

        a = tensor.TensorType('float32', (False,) * len(shape))()
        dim_pattern = range(len(shape))
        dim_pattern[0] = 1
...@@ -223,10 +227,12 @@ def test_careduce(): ...@@ -223,10 +227,12 @@ def test_careduce():
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} support only those patterns #GpuCAReduce{maximum/minimum} support only those patterns
if scalar_op is theano.scalar.maximum and pat not in [ if scalar_op in [theano.scalar.maximum,
(0, 1), (0, 1, 1), (0, 1, 1)]: theano.scalar.minimum] and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
continue continue
shape = numpy.asarray(shape) * 2 shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))() a2 = tcn.CudaNdarrayType((False,) * len(shape))()
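The pattern skipping above hinges on which axis combinations GpuCAReduce supports; the reduction semantics being tested can be sketched in plain NumPy (a sketch only — `careduce` and the sample array are hypothetical names, not Theano API):

```python
import numpy as np

def careduce(a, scalar_op, axis):
    # NumPy analogue of CAReduce(scalar_op, axis=...): fold the
    # associative binary scalar op over every listed axis.
    return scalar_op.reduce(a, axis=tuple(axis))

a = np.arange(24.).reshape(2, 3, 4)
total = careduce(a, np.add, [0, 1, 2])   # full reduction, a scalar
row_min = careduce(a, np.minimum, [1])   # reduce axis 1 only
```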
...
@@ -1131,6 +1137,35 @@ def test_shared_cudandarray():
     assert isinstance(a.type, tcn.CudaNdarrayType)
 
+
+def test_gpueye():
+    def check(dtype, N, M_=None):
+        # Theano does not accept None as a tensor.
+        # So we must use a real value.
+        M = M_
+        # Currently DebugMode does not support None as inputs even if this is
+        # allowed.
+        if M is None:
+            M = N
+        N_symb = T.iscalar()
+        M_symb = T.iscalar()
+        k_symb = numpy.asarray(0)
+        out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
+        f = theano.function([N_symb, M_symb],
+                            B.as_cuda_ndarray_variable(out),
+                            mode=mode_with_gpu)
+        result = numpy.asarray(f(N, M))
+        assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
+        assert result.dtype == numpy.dtype(dtype)
+        assert any([isinstance(node.op, B.GpuEye)
+                    for node in f.maker.fgraph.toposort()])
+
+    for dtype in ['float32']:
+        yield check, dtype, 3
+        # M != N, k = 0
+        yield check, dtype, 3, 5
+        yield check, dtype, 5, 3
+
 
 class test_size(unittest.TestCase):
     """
...
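The new `test_gpueye` leans on `eye(N, M, k)` semantics that NumPy shares: an N x M matrix with ones on the k-th diagonal. A quick NumPy sketch of the rectangular cases the test covers (M != N, k = 0):

```python
import numpy as np

# eye(N, M) with M != N: ones on the main diagonal of a
# rectangular matrix, zeros everywhere else.
tall = np.eye(5, 3, dtype='float32')   # 5 rows, 3 columns
wide = np.eye(3, 5, dtype='float32')   # 3 rows, 5 columns
```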
+from kron import kron
 from ops import (cholesky, matrix_inverse, solve,
                  diag, extract_diag, alloc_diag,
                  det, psd, eig, eigh,
...
-import theano
-from theano.gof import Op, Apply
 from theano import tensor
-
-try:
-    import scipy.linalg
-    imported_scipy = True
-except ImportError:
-    imported_scipy = False
-
-
-class Kron(Op):
-    """
-    Kronecker product of a and b.
-
-    Parameters:
-    a: array, shape (M, N)
-    b: array, shape (P, Q)
-
-    Returns:
-    A: array, shape (M*P, N*Q)
-
-    The result is the block matrix:
-    (notice that a[i,j]*b is itself a matrix of the same shape as b)
-
-    a[0,0]*b a[0,1]*b ... a[0,-1]*b
-    a[1,0]*b a[1,1]*b ... a[1,-1]*b
-    ...
-    a[-1,0]*b a[-1,1]*b ... a[-1,-1]*b
-    """
-
-    def __eq__(self, other):
-        return type(self) == type(other)
-
-    def __hash__(self):
-        return hash(type(self))
-
-    def __str__(self):
-        return "%s" % self.__class__.__name__
-
-    def make_node(self, a, b):
-        assert imported_scipy, (
-            "Scipy not available. Scipy is needed for the Kron op")
-        a = tensor.as_tensor_variable(a)
-        b = tensor.as_tensor_variable(b)
-        if (not a.ndim == 2 or not b.ndim == 2):
-            raise TypeError('%s: inputs must have two dimensions' %
-                            self.__class__.__name__)
-        out_var = tensor.TensorType(dtype=theano.scalar.upcast(a, b),
-                                    broadcastable=(False, False))()
-        return Apply(self, [a, b], [out_var])
-
-    def infer_shape(self, node, in_shapes):
-        shape_a, shape_b = in_shapes
-        return [[shape_a[0] * shape_b[0], shape_a[1] * shape_b[1]]]
-
-    def perform(self, node, inputs, output_storage):
-        a, b = inputs
-        output_storage[0][0] = scipy.linalg.kron(a, b)
-
-    def grad(self, inputs, cost_grad):
-        raise NotImplementedError('%s: gradient is not currently'
-                                  ' implemented' % self.__class__.__name__)
-
-kron = Kron()
+
+
+def kron(a, b):
+    """ Kronecker product
+
+    Same as scipy.linalg.kron(a, b).
+
+    :note: numpy.kron(a, b) != scipy.linalg.kron(a, b)!
+        They don't have the same shape and order when
+        a.ndim != b.ndim != 2.
+
+    :param a: array_like
+    :param b: array_like
+    :return: array_like with a.ndim + b.ndim - 2 dimensions.
+    """
+    a = tensor.as_tensor_variable(a)
+    b = tensor.as_tensor_variable(b)
+    if (a.ndim + b.ndim <= 2):
+        raise TypeError('kron: inputs dimensions must sum to 3 or more. '
+                        'You passed %d and %d.' % (a.ndim, b.ndim))
+    o = tensor.outer(a, b)
+    o = o.reshape(tensor.concatenate((a.shape, b.shape)),
+                  a.ndim + b.ndim)
+    shf = o.dimshuffle(0, 2, 1, * range(3, o.ndim))
+    if shf.ndim == 3:
+        shf = o.dimshuffle(1, 0, 2)
+        o = shf.flatten()
+    else:
+        o = shf.reshape((o.shape[0] * o.shape[2],
+                         o.shape[1] * o.shape[3]) +
+                        tuple([o.shape[i] for i in range(4, o.ndim)]))
+    return o
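The rewritten `kron` builds the product out of `outer`, a reshape, and an axis swap, instead of calling scipy inside a `perform` method. The same construction can be checked in plain NumPy for the 2-D case (`kron_via_outer` is a hypothetical name mirroring the graph above, not Theano API):

```python
import numpy as np

def kron_via_outer(a, b):
    # Mirror the Theano graph: outer product, reshape to
    # a.shape + b.shape, swap the two middle axes, then collapse.
    o = np.outer(a, b).reshape(a.shape + b.shape)
    shf = o.transpose(0, 2, 1, *range(3, o.ndim))
    if shf.ndim == 3:
        return o.transpose(1, 0, 2).ravel()
    return shf.reshape((o.shape[0] * o.shape[2],
                        o.shape[1] * o.shape[3]) +
                       tuple(o.shape[i] for i in range(4, o.ndim)))

a = np.arange(6.).reshape(2, 3)
b = np.arange(12.).reshape(3, 4)
out = kron_via_outer(a, b)   # block matrix, shape (6, 12)
```

For 2-D inputs this agrees with `numpy.kron` (which matches `scipy.linalg.kron` in the 2-D case); the note in the docstring is about higher-rank inputs, where the two libraries disagree.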
 from nose.plugins.skip import SkipTest
 
 import numpy
+import theano
 from theano import tensor, function
 from theano.tests import unittest_tools as utt
-from theano.sandbox.linalg.kron import Kron, kron
+from theano.sandbox.linalg.kron import kron
+
+floatX = theano.config.floatX
 
 try:
     import scipy.linalg
@@ -11,9 +14,6 @@ try:
 except ImportError:
     imported_scipy = False
 
-if not imported_scipy:
-    raise SkipTest('Kron Op need the scipy package to be installed')
-
 
 class TestKron(utt.InferShapeTester):
@@ -21,32 +21,41 @@ class TestKron(utt.InferShapeTester):
     def setUp(self):
         super(TestKron, self).setUp()
-        self.op_class = Kron
         self.op = kron
 
     def test_perform(self):
-        x = tensor.dmatrix()
-        y = tensor.dmatrix()
-        f = function([x, y], kron(x, y))
-
-        for shp0 in [(8, 6), (5, 8)]:
-            for shp1 in [(5, 7), (3, 3)]:
-                a = numpy.random.rand(*shp0)
-                b = numpy.random.rand(*shp1)
+        if not imported_scipy:
+            raise SkipTest('kron tests need the scipy package to be installed')
+
+        for shp0 in [(2,), (2, 3), (2, 3, 4), (2, 3, 4, 5)]:
+            for shp1 in [(6,), (6, 7), (6, 7, 8), (6, 7, 8, 9)]:
+                if len(shp0) + len(shp1) == 2:
+                    continue
+                x = tensor.tensor(dtype='floatX',
+                                  broadcastable=(False,) * len(shp0))
+                y = tensor.tensor(dtype='floatX',
+                                  broadcastable=(False,) * len(shp1))
+                f = function([x, y], kron(x, y))
+                a = numpy.asarray(self.rng.rand(*shp0)).astype(floatX)
+                b = self.rng.rand(*shp1).astype(floatX)
                 out = f(a, b)
                 assert numpy.allclose(out, scipy.linalg.kron(a, b))
 
-    def test_infer_shape(self):
-        x = tensor.dmatrix()
-        y = tensor.dmatrix()
-        self._compile_and_check([x, y], [self.op(x, y)],
-                                [numpy.random.rand(8, 5),
-                                 numpy.random.rand(3, 7)],
-                                self.op_class)
-        self._compile_and_check([x, y], [self.op(x, y)],
-                                [numpy.random.rand(2, 5),
-                                 numpy.random.rand(6, 3)],
-                                self.op_class)
+    def test_numpy_2d(self):
+        for shp0 in [(2, 3)]:
+            for shp1 in [(6, 7)]:
+                if len(shp0) + len(shp1) == 2:
+                    continue
+                x = tensor.tensor(dtype='floatX',
+                                  broadcastable=(False,) * len(shp0))
+                y = tensor.tensor(dtype='floatX',
+                                  broadcastable=(False,) * len(shp1))
+                f = function([x, y], kron(x, y))
+                a = numpy.asarray(self.rng.rand(*shp0)).astype(floatX)
+                b = self.rng.rand(*shp1).astype(floatX)
+                out = f(a, b)
+                assert numpy.allclose(out, numpy.kron(a, b))
 
 if __name__ == "__main__":
     t = TestKron('setUp')
...
...
@@ -220,9 +220,16 @@ class Psi(UnaryScalarOp):
     def c_support_code(self):
         return (
             """
+            // For GPU support
+            #ifdef __CUDACC__
+            #define DEVICE __device__
+            #else
+            #define DEVICE
+            #endif
+
             #ifndef _PSIFUNCDEFINED
             #define _PSIFUNCDEFINED
-            double _psi(double x){
+            DEVICE double _psi(double x){
 
             /*taken from
             Bernardo, J. M. (1976). Algorithm AS 103:
...
...
@@ -1733,7 +1733,10 @@ class AddSD(gof.op.Op):
         x, y = as_sparse_variable(x), tensor.as_tensor_variable(y)
         if x.type.dtype != y.type.dtype:
-            raise NotImplementedError()
+            raise NotImplementedError(
+                "AddSD support inputs with the same dtype only."
+                " You passed %s and %s inputs dtype." % (x.type.dtype,
+                                                         y.type.dtype))
         indices, indptr, data = csm_indices(x), csm_indptr(x), csm_data(x)
@@ -1750,7 +1753,7 @@ class AddSD(gof.op.Op):
     def c_code(self, node, name, (_data, _indices, _indptr, y), (z, ), sub):
         inplace = int(self.inplace)
-        format = {'csc': 0, 'csr':1}[self.format]
+        format = {'csc': 0, 'csr': 1}[self.format]
         code = """
                 Py_XDECREF(%(z)s);
                 if (!%(inplace)s){
@@ -1759,7 +1762,7 @@ class AddSD(gof.op.Op):
                     %(z)s = %(y)s;
                     Py_XINCREF(%(z)s);
                 }
 
                 npy_intp N = PyArray_DIMS(%(_indptr)s)[0]-1;
                 const npy_int32 * __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
                 const npy_int32 * __restrict__ indices = (npy_int32*)%(_indices)s->data;
@@ -1795,9 +1798,9 @@ class AddSD(gof.op.Op):
         assert _is_dense(y)
         if self.format == 'csr':
-            x = scipy.sparse.csr_matrix((data, indices, indptr), shape = y.shape)
+            x = scipy.sparse.csr_matrix((data, indices, indptr), shape=y.shape)
         elif self.format == 'csc':
-            x = scipy.sparse.csc_matrix((data, indices, indptr), shape = y.shape)
+            x = scipy.sparse.csc_matrix((data, indices, indptr), shape=y.shape)
         # The asarray is needed as in some case, this return a
         # numpy.matrixlib.defmatrix.matrix object and not an ndarray.
...
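The AddSD C code above walks a compressed-sparse structure (`data`, `indices`, `indptr`) and adds it into the dense operand. A pure-NumPy sketch of the CSR case (`add_csr_dense` is a hypothetical helper, not the Op's API):

```python
import numpy as np

def add_csr_dense(data, indices, indptr, y):
    # For each row, indptr[row]:indptr[row + 1] delimits that row's
    # stored entries in `data`; `indices` holds their column positions.
    z = y.astype(np.result_type(data, y), copy=True)
    for row in range(len(indptr) - 1):
        for k in range(indptr[row], indptr[row + 1]):
            z[row, indices[k]] += data[k]
    return z

# CSR form of [[1, 0], [0, 2], [3, 0]]
data = np.array([1., 2., 3.])
indices = np.array([0, 1, 0])
indptr = np.array([0, 1, 2, 3])
y = np.ones((3, 2))
z = add_csr_dense(data, indices, indptr, y)
```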
...
@@ -506,8 +506,8 @@ class T_AddMul(unittest.TestCase):
                 elif op is mul:
                     self.assertTrue(_is_sparse_variable(apb))
                     self.assertTrue(numpy.all(val.todense() == (b.multiply(a))))
-                    self.assertTrue(numpy.all(val.todense() == numpy.array([[1, 0],
-                                                                            [9, 0], [0, 36]])))
+                    self.assertTrue(numpy.all(val.todense() == numpy.array(
+                        [[1, 0], [9, 0], [0, 36]])))
 
     def _testDS(self, op, array1=numpy.array([[1., 0], [3, 0], [0, 6]]),
                 array2=numpy.asarray([[0, 2.], [0, 4], [5, 0]])):
...
...
@@ -3621,6 +3621,9 @@ class Eye(gof.Op):
         n = as_tensor_variable(n)
         m = as_tensor_variable(m)
         k = as_tensor_variable(k)
+        assert n.ndim == 0
+        assert m.ndim == 0
+        assert k.ndim == 0
         return gof.Apply(self, [n, m, k],
                          [TensorType(dtype=self.dtype, broadcastable=(False, False))()])
@@ -8185,7 +8188,14 @@ def tensordot(a, b, axes=2):
 def outer(x, y):
-    """Return vector-vector outer product."""
+    """Return vector-vector outer product.
+
+    If an input isn't a vector, we flatten it first.
+    """
+    if x.ndim != 1:
+        x = x.flatten()
+    if y.ndim != 1:
+        y = y.flatten()
     return dot(
         x.dimshuffle(0, 'x'),
         y.dimshuffle('x', 0))
...
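The change to `outer` flattens non-vector inputs before the column-times-row product. In NumPy terms (a sketch; NumPy's own `np.outer` already performs this flattening, which is what the new behaviour matches):

```python
import numpy as np

def outer_flatten(x, y):
    # Flatten anything that isn't a vector, then multiply a column
    # by a row, giving an (x.size, y.size) matrix.
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()
    return x[:, None] * y[None, :]

a = np.arange(6.).reshape(2, 3)   # flattened to length 6
b = np.arange(4.)
out = outer_flatten(a, b)          # shape (6, 4)
```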
...
@@ -197,6 +197,34 @@ def default_blas_ldflags():
                 return ' '.join(
                     ['-L%s' % os.path.join(sys.prefix, "lib")] +
                     ['-l%s' % l for l in blas_info['libraries']])
+            #Canopy
+            if "Canopy" in sys.prefix:
+                if sys.platform == "darwin":
+                    p2 = os.path.join(sys.base_prefix, "lib")
+                    assert os.path.exists(p2), "Canopy changed the location of MKL"
+                    return ' '.join(
+                        ['-L%s' % p2] +
+                        ['-l%s' % l for l in blas_info['libraries']])
+                p = os.path.join(sys.base_prefix, "..", "..", "appdata")
+                assert os.path.exists(p), "Canopy changed the location of MKL"
+                p2 = os.listdir(p)
+                assert len(p2) == 1, "Canopy changed the location of MKL"
+                if sys.platform == "linux2":
+                    p2 = os.path.join(p, p2[0], "lib")
+                    assert os.path.exists(p2), "Canopy changed the location of MKL"
+                    return ' '.join(
+                        ['-L%s' % p2] +
+                        ['-l%s' % l for l in blas_info['libraries']])
+                elif sys.platform == 'win32':
+                    p2 = os.path.join(p, p2[0], "Scripts")
+                    assert os.path.exists(p2), "Canopy changed the location of MKL"
+                    return ' '.join(
+                        ['-L%s' % p2] +
+                        # Why on Windows, the libraries used are not the
+                        # same as what is in blas_info['libraries']?
+                        ['-l%s' % l for l in ["mk2_core", "mk2_intel_thread",
+                                              "mk2_rt"]])
 
             #if numpy was linked with library that are not installed, we
             #can't reuse them.
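Each Canopy branch above returns the same shape of linker line: one `-L` for the directory that holds MKL plus one `-l` per library name. The assembly step, factored out as a sketch (`ldflags_for` is a hypothetical helper, and the path is an example):

```python
def ldflags_for(lib_dir, libraries):
    # One -L flag for the search path, one -l flag per library name,
    # joined into a single linker argument string for gcc.
    return ' '.join(['-L%s' % lib_dir] +
                    ['-l%s' % l for l in libraries])

flags = ldflags_for('/opt/canopy/appdata/lib', ['mk2_core', 'mk2_rt'])
```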
...
@@ -1921,9 +1949,9 @@ def local_dot22_to_dot22scalar(node):
     scalar_idx = -1
     for i, x in enumerate(node.inputs):
-        if (i_scalar[i] is not None
-            and (theano.scalar.upcast(x.type.dtype, d.type.dtype)
-                 == d.type.dtype)):
+        if (i != dot22_idx and i_scalar[i] is not None and
+            (theano.scalar.upcast(x.type.dtype, d.type.dtype) ==
+             d.type.dtype)):
             scalar_idx = i
             break
     if scalar_idx < 0:
...
...
@@ -961,6 +961,12 @@ class Elemwise(Op):
         inames = gof.utils.uniq(inames)
         inputs = gof.utils.uniq(node.inputs)
 
+        # Assert that inames and inputs keep a consistent order.
+        # This protects against future changes to uniq.
+        assert len(inames) == len(inputs)
+        ii, iii = zip(*gof.utils.uniq(zip(_inames, node.inputs)))
+        assert all([x == y for x, y in zip(ii, inames)])
+        assert all([x == y for x, y in zip(iii, inputs)])
+
         defines = ""
         undefs = ""
...
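The new assertions check that de-duplicating the names and the inputs separately stays aligned with de-duplicating them as pairs. The property can be sketched with an order-preserving `uniq` (a stand-in for `gof.utils.uniq`, whose first-occurrence-wins behaviour is assumed here):

```python
def uniq(seq):
    # Order-preserving de-duplication: keep the first occurrence only.
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]

names = ['a', 'b', 'a', 'c', 'b']
values = [1, 2, 1, 3, 2]
inames, inputs = uniq(names), uniq(values)
# De-duplicating the (name, value) pairs must give the same two
# sequences back once unzipped, provided duplicates co-occur.
ii, iii = zip(*uniq(list(zip(names, values))))
```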
...
@@ -3,6 +3,7 @@ from itertools import imap
 
 import numpy
 
+from theano.compat.python2x import any
 import theano.tensor.inplace
 from theano.tensor import basic as tensor
 from theano import tensor as T
...
...
@@ -2729,6 +2729,42 @@ class T_min_max(unittest.TestCase):
         #axis=1)[0], n)),axis=1)
 
+
+class T_outer(unittest.TestCase):
+    def test_outer(self):
+        for m in range(4):
+            for n in range(4):
+                x = tensor.tensor(dtype='floatX', broadcastable=(False,) * m)
+                y = tensor.tensor(dtype='floatX', broadcastable=(False,) * n)
+                s1 = numpy.random.randint(1, 10, m)
+                s2 = numpy.random.randint(1, 10, n)
+                v1 = numpy.asarray(numpy.random.rand(*s1)).astype(floatX)
+                v2 = numpy.asarray(numpy.random.rand(*s2)).astype(floatX)
+                o = tensor.outer(x, y).eval({x: v1, y: v2})
+                assert_allclose(o, numpy.outer(v1, v2))
+
+    def test_grad(self):
+        """
+        Test the gradient of the graph of outer
+        with broadcastable dimensions, just in case.
+        """
+        for shp0, shp1 in [((1,), (2,)),
+                           ((3,), (1,)),
+                           ((1,), (1,)),
+                           ((3,), (2,)),
+                           ((3, 2), (1, 1)),
+                           ((3, 2), (1, 4)),
+                           ((3, 2), (4, 1)),
+                           ((3, 2), (4, 5)),
+                           ((1, 2), (4, 5)),
+                           ((3, 1), (4, 5)),
+                           ((1, 1), (4, 5)),
+                           ((1, 1), (1, 1)),
+                           ]:
+            data0 = numpy.random.rand(*shp0).astype(floatX)
+            data1 = numpy.random.rand(*shp1).astype(floatX)
+            utt.verify_grad(tensor.outer, [data0, data1])
+
+
 class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
     """
     This is build in a way that allow to reuse it to test the
...
...
@@ -875,8 +875,8 @@ class test_fusion(unittest.TestCase):
                 fyv * fzv, 'float32'),  # 2
             (fx * fy + fz, (fx, fy, fz), (fxv, fyv, fzv), 1, fxv *
                 fyv + fzv, 'float32'),  # 3
-            (fw + fx + fy + fz, (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1, fwv +
-                fxv + fyv + fzv, 'float32'),
+            (fw + fx + fy + fz, (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1,
+                fwv + fxv + fyv + fzv, 'float32'),
             ((fw + fx) + (fy + fz), (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1,
                 fwv + fxv + fyv + fzv, 'float32'),  # 5
             (((fw + fx) + fy) + fz, (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1,
@@ -886,29 +886,29 @@ class test_fusion(unittest.TestCase):
             ((fw + (fx + fy) + fz), (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1,
                 fwv + fxv + fyv + fzv, 'float32'),
             (fw + (fx + (fy + fz)), (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1,
-                fwv + fxv + fyv +fzv, 'float32'),
+                fwv + fxv + fyv + fzv, 'float32'),
             ((fw+fx)+(fy+fz), (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1,
                 fwv+fxv+fyv+fzv, 'float32'),  # 10
-            (fw*fx*fy*fz, (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1, fwv*
-                fxv*fyv*fzv, 'float32'),
-            (fw+fx*fy*fz, (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1, fwv+
-                fxv*fyv*fzv, 'float32'),
+            (fw*fx*fy*fz, (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1,
+                fwv * fxv * fyv * fzv, 'float32'),
+            (fw+fx*fy*fz, (fw, fx, fy, fz), (fwv, fxv, fyv, fzv), 1,
+                fwv + fxv * fyv * fzv, 'float32'),
             (fx+fy*fz*fx, (fx, fy, fz), (fxv, fyv, fzv), 1,
-                fxv+fyv*fzv*fxv, 'float32'),
+                fxv + fyv * fzv * fxv, 'float32'),
             (fx*fy+fz+fy, (fx, fy, fz), (fxv, fyv, fzv), 1,
-                fxv*fyv+fzv+fyv, 'float32'),
+                fxv * fyv + fzv + fyv, 'float32'),
             (fx*fy*fz*fw+fx+fy+fz+fw, (fw, fx, fy, fz), (fwv, fxv,
                 fyv, fzv), 1, fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv, 'float32'),  # 15
             #test with constant
-            ((fw+fx)+(fy+fz)+ 2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv)
-                ,1,fwv+fxv+fyv+fzv+2,'float32'),
-            (((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
+            ((fw+fx)+(fy+fz)+ 2.,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
+                1,fwv+fxv+fyv+fzv+2,'float32'),
+            (((fw+fx)+2.+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
                 1,fwv+fxv+fyv+fzv+2,'float32'),
-            ((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
+            ((fw+(fx+2.+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
                 1,fwv+fxv+fyv+fzv+2,'float32'),
             ((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
                 1,fwv+fxv+fyv+fzv+2,'float32'),
-            (fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
+            (fw+(fx+(fy+fz)+2.),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
                 1,fwv+fxv+fyv+fzv+2,'float32'),  # 20
             (2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),
                 1,fwv+fxv+fyv+fzv+2,'float32'),
@@ -1027,16 +1027,18 @@ class test_fusion(unittest.TestCase):
         fail2 = []
         fail3 = []
         fail4 = []
-        for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
+        for id, [g, sym_inputs, val_inputs,
+                 nb_elemwise, answer, out_dtype] in enumerate(cases):
             if isinstance(out_dtype, dict):
                 out_dtype = out_dtype[config.cast_policy]
-            if gpu and (out_dtype!='float32' or any(i.dtype != 'float32' for i in g.owner.inputs)):
+            if (gpu and (out_dtype != 'float32' or
+                         any(i.dtype != 'float32' for i in g.owner.inputs))):
                 print "Skip test %d as the gpu code currently supports only float32" % id
                 continue
             print "new cases", id
 
             if shared_fn is None:
-                assert gpu == False
+                assert gpu is False
                 f = compile.function(list(sym_inputs), g, mode=mode)
                 for x in range(nb_repeat):
                     out = f(*val_inputs)
...
 import os, unittest, sys
 import nose.plugins.builtin
+from nose.config import Config
 from numpy.testing.nosetester import import_nose, NoseTester
 from numpy.testing.noseclasses import KnownFailure, NumpyTestProgram
@@ -122,7 +123,8 @@ class TheanoNoseTester(NoseTester):
         argv, plugins = self.prepare_test_args(verbose, extra_argv, coverage,
                                                capture, knownfailure)
-        t = NumpyTestProgram(argv=argv, exit=False, plugins=plugins)
+        cfg = Config(includeExe=True)
+        t = NumpyTestProgram(argv=argv, exit=False, plugins=plugins, config=cfg)
         return t.result
...
...
@@ -155,6 +155,8 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
         data = cPickle.load(open(noseids_file, 'rb'))
         ids = data['ids']
         n_tests = len(ids)
+        if n_tests == 0:
+            raise Exception("0 tests selected")
         assert n_tests == max(ids)
 
     # Standard batch testing is called for
...