Commit 6dcd4721 authored by lamblin

Merge pull request #881 from nouiz/mixed3

Mixed3
...@@ -16,22 +16,22 @@ The descriptions are brief and point to further reading.
If you would like to add an additional optimization, refer to
:ref:`optimization` in the guide to extending Theano.
.. note::
    This list is partial.

    The print_summary method allows several OpDBs and optimizers to list
    the optimizations they execute, which keeps this list up to date:

    python -c 'import theano; theano.compile.FAST_RUN.optimizer.print_summary()'
    python -c 'import theano; theano.compile.FAST_COMPILE.optimizer.print_summary()'
========================================================= ========= ============ =============
Optimization                                              FAST_RUN  FAST_COMPILE Stabilization
========================================================= ========= ============ =============
:term:`merge`                                             x         x
:term:`constant folding<constant folding>`                x         x
:term:`shape promotion<shape promotion>`                  x
:term:`fill cut<fill cut>`                                x
:term:`inc_subtensor srlz.<inc_subtensor serialization>`  x
...@@ -53,7 +53,8 @@ Optimization FAST_RUN FAST_COMPILE
:term:`inplace_random`                                    x
:term:`elemwise fusion`                                   x
:term:`GPU transfer`                                      x
:term:`local_log_softmax`                                 x                      x
========================================================= ========= ============ =============
.. glossary::
...@@ -252,5 +253,8 @@ Optimization FAST_RUN FAST_COMPILE
See :func:`theano.sandbox.cuda.opt.*`.
local_log_softmax
    This is a stabilization optimization.
    Due to rounding, the softmax probability of one value can reach 0.
    Taking the log of 0 gives -inf, which will likely produce NaN later.
    This optimization computes a numerically closer answer instead.
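A minimal numpy sketch (an illustration, not Theano code) of the instability this optimization removes: computing log(softmax(x)) naively underflows/overflows, while the shifted log-sum-exp form stays finite.

```python
import numpy as np

x = np.array([0.0, 100.0], dtype='float32')

# Naive log(softmax(x)): exp(100) overflows float32, so the division
# and the log produce non-finite values (-inf / nan).
with np.errstate(over='ignore', invalid='ignore', divide='ignore'):
    naive = np.log(np.exp(x) / np.sum(np.exp(x)))

# Stabilized form: log_softmax(x) = (x - m) - log(sum(exp(x - m)))
# with m = max(x). Every value stays finite.
shifted = x - x.max()
stable = shifted - np.log(np.sum(np.exp(shifted)))

print(naive)   # contains non-finite values
print(stable)  # finite: approximately [-100., 0.]
```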
...@@ -47,7 +47,10 @@ file and run it.
t1 = time.time()
print 'Looping %d times took' % iters, t1 - t0, 'seconds'
print 'Result is', r
if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]):
    print 'Used the cpu'
else:
    print 'Used the gpu'
The program just computes the exp() of a bunch of random numbers.
Note that we use the `shared` function to
...@@ -105,7 +108,10 @@ after the T.exp(x) is replaced by a GPU version of exp().
print 'Looping %d times took' % iters, t1 - t0, 'seconds'
print 'Result is', r
print 'Numpy result is', numpy.asarray(r)
if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]):
    print 'Used the cpu'
else:
    print 'Used the gpu'
The output from this program is
...@@ -161,7 +167,10 @@ that it has the un-wanted side-effect of really slowing things down.
print 'Looping %d times took' % iters, t1 - t0, 'seconds'
print 'Result is', r
print 'Numpy result is', numpy.asarray(r)
if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]):
    print 'Used the cpu'
else:
    print 'Used the gpu'
Running this version of the code takes just under 0.05 seconds, over 140x faster than
the CPU implementation!
...
...@@ -52,7 +52,7 @@ from gof import \
Container, \
InconsistencyError, FunctionGraph, \
Apply, Variable, Constant, \
Op, OpenMPOp, \
opt, \
toolbox, \
Type, Generic, generic, \
...@@ -106,9 +106,6 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
import configdefaults_late
# Use config.numpy to call numpy.seterr
import numpy
if config.numpy.seterr_all == 'None':
...
...@@ -376,3 +376,39 @@ AddConfigVar('exception_verbosity',
C. log_likelihood_h""",
EnumStr('low', 'high'),
in_c_key=False)
#Test if the env variable is set
var = os.getenv('OMP_NUM_THREADS', None)
if var:
try:
int(var)
except ValueError:
raise TypeError("The environment variable OMP_NUM_THREADS"
" should be a number, got '%s'." % var)
else:
default_openmp = int(var) != 1
else:
#Check the number of available CPU cores.
count = cpuCount()
if count == -1:
_logger.warning("We are not able to detect the number of CPU cores."
" We disable openmp by default. To remove this"
" warning, set the environment variable"
" OMP_NUM_THREADS to the number of threads you"
" want theano to use.")
default_openmp = count > 1
AddConfigVar('openmp',
"Allow (or not) parallel computation on the CPU with OpenMP. "
"This is the default value used when creating an Op that "
"supports OpenMP parallelization. It is preferable to define it "
"via the Theano configuration file ~/.theanorc or with the "
"environment variable THEANO_FLAGS. Parallelization is only "
"done for some operations that implement it, and even for "
"operations that implement parallelism, each operation is free "
"to respect this flag or not. You can control the number of "
"threads used with the environment variable OMP_NUM_THREADS."
" If it is set to 1, we disable openmp in Theano by default.",
BoolParam(default_openmp),
in_c_key=False,
)
"""
This file defines Theano flags which need to be defined late in import order.
This is needed as they rely on the values of other previously-defined flags.
"""
import os
import logging
import subprocess
import tempfile
import theano
from theano.configparser import (
AddConfigVar, BoolParam, ConfigParam, EnumStr, IntParam,
TheanoConfigParser)
from theano.misc.cpucount import cpuCount
_logger = logging.getLogger('theano.configdefaults_late')
config = TheanoConfigParser()
#http://pyprocessing.berlios.de/
#True if the environment variable OMP_NUM_THREADS != 1 (or, if it is
#unset, if we detect more than 1 CPU core) and g++ supports OpenMP;
#otherwise False.
default_openmp = True
#Test if the env variable is set
var = os.getenv('OMP_NUM_THREADS', None)
if var:
try:
int(var)
except ValueError:
raise TypeError("The environment variable OMP_NUM_THREADS"
" should be a number, got '%s'." % var)
else:
default_openmp = int(var) != 1
else:
#Check the number of available CPU cores.
count = cpuCount()
if count == -1:
_logger.warning("We are not able to detect the number of CPU cores."
" We disable openmp by default. To remove this"
" warning, set the environment variable"
" OMP_NUM_THREADS to the number of threads you"
" want theano to use.")
default_openmp = count > 1
dummy_stdin = open(os.devnull)
if default_openmp and theano.configdefaults.gxx_avail:
#Check if g++ supports OpenMP. We need to compile a file because the
#EPD version of g++ has OpenMP enabled in its specs file but does not
#include the OpenMP files.
try:
code = """
#include <omp.h>
int main( int argc, const char* argv[] )
{
int res[10];
for(int i=0; i < 10; i++){
res[i] = i;
}
}
"""
fd, path = tempfile.mkstemp(suffix='.c', prefix='test_omp_')
try:
os.write(fd, code)
os.close(fd)
fd = None
proc = subprocess.Popen(['g++', '-fopenmp', path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdin=dummy_stdin.fileno())
proc.wait()
if proc.returncode != 0:
default_openmp = False
finally:
# Ensure `fd` is closed before we remove the temporary file.
try:
if fd is not None:
os.close(fd)
finally:
os.remove(path)
except OSError, e:
default_openmp = False
del dummy_stdin
AddConfigVar('openmp',
"Enable (or not) parallel computation on the CPU with OpenMP. "
"This is the default value used when creating an Op that "
"supports OpenMP parallelization. It is preferable to define it "
"via the Theano configuration file ~/.theanorc or with the "
"environment variable THEANO_FLAGS. Parallelization is only "
"done for some operations that implement it, and even for "
"operations that implement parallelism, each operation is free "
"to respect this flag or not.",
BoolParam(default_openmp),
in_c_key=False,
)
...@@ -55,7 +55,7 @@ from link import \
Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany
from op import \
Op, OpenMPOp, PureOp, ops_with_inner_function
from opt import (Optimizer, optimizer, SeqOptimizer,
MergeOptimizer, MergeOptMerge,
...
...@@ -11,8 +11,11 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import copy
import logging
import os
import subprocess
import tempfile
import warnings
import theano
...@@ -781,3 +784,91 @@ We need that to be able not to run debug checks a number of times that is
exponential in the nesting level of those ops.
For instance, Scan will be registered here.
"""
class OpenMPOp(Op):
    """All Ops using OpenMP code should inherit from this class.

    This Op checks that the compiler correctly supports OpenMP code.
    If not, it prints a warning, disables OpenMP for this Op, and
    generates the non-OpenMP code instead.

    This is needed because the EPD g++ version on Windows claims
    OpenMP support in its spec information but does not include the
    OpenMP files.

    We also add the correct compiler flags in c_compile_args.
    """
gxx_support_openmp = None
"""
True/False after we tested this.
"""
def __init__(self, openmp=None):
if openmp is None:
openmp = theano.config.openmp
self.openmp = openmp
def c_compile_args(self):
if self.openmp:
return ['-fopenmp']
return []
@staticmethod
def test_gxx_support():
try:
code = """
#include <omp.h>
int main( int argc, const char* argv[] )
{
int res[10];
for(int i=0; i < 10; i++){
res[i] = i;
}
}
"""
fd, path = tempfile.mkstemp(suffix='.c', prefix='test_omp_')
dummy_stdin = open(os.devnull)
try:
os.write(fd, code)
os.close(fd)
fd = None
proc = subprocess.Popen(['g++', '-fopenmp', path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdin=dummy_stdin.fileno())
proc.wait()
if proc.returncode != 0:
    return False
finally:
del dummy_stdin
# Ensure `fd` is closed before we remove the temporary file.
try:
if fd is not None:
os.close(fd)
finally:
os.remove(path)
except OSError, e:
return False
return True
def make_thunk(self, node, storage_map, compute_map, no_recycling):
op = self
if self.openmp:
if OpenMPOp.gxx_support_openmp is None:
OpenMPOp.gxx_support_openmp = OpenMPOp.test_gxx_support()
if not OpenMPOp.gxx_support_openmp:
#We want to warn only once.
warnings.warn(
"Your g++ compiler fails to compile OpenMP code. We"
" know this happens with some versions of the EPD mingw"
" compiler. We disable openmp everywhere in Theano."
" To remove this warning, set the Theano flag `openmp`"
" to False.")
if OpenMPOp.gxx_support_openmp is False:
op = copy.copy(self)
op.openmp = False
theano.config.openmp = False
return super(OpenMPOp, op).make_thunk(node, storage_map,
compute_map, no_recycling)
import StringIO
import sys
from python25 import DefaultOrderedDict
import numpy
import opt
...@@ -29,7 +26,7 @@ class DB(object):
return self._optimizer_idx
def __init__(self):
self.__db__ = DefaultOrderedDict(set)
self._names = set()
self.name = None  # will be reset by register
#(via obj.name by the thing doing the registering)
...
...@@ -158,3 +158,176 @@ if sys.version_info[:2] < (2, 6):
else:
from itertools import combinations, product
from sys import maxsize
if sys.version_info[:2] < (2, 7):
# The following implementation of OrderedDict compatible with python 2.4
# was taken from http://pypi.python.org/pypi/ordereddict/1.1
# It is under the MIT license.
# Copyright (c) 2009 Raymond Hettinger
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
from UserDict import DictMixin
class OrderedDict(dict, DictMixin):
def __init__(self, *args, **kwds):
if len(args) > 1:
raise TypeError('expected at most 1 arguments, got %d' %
len(args))
try:
self.__end
except AttributeError:
self.clear()
self.update(*args, **kwds)
def clear(self):
self.__end = end = []
end += [None, end, end] # sentinel node for doubly linked list
self.__map = {} # key --> [key, prev, next]
dict.clear(self)
def __setitem__(self, key, value):
if key not in self:
end = self.__end
curr = end[1]
curr[2] = end[1] = self.__map[key] = [key, curr, end]
dict.__setitem__(self, key, value)
def __delitem__(self, key):
dict.__delitem__(self, key)
key, prev, next = self.__map.pop(key)
prev[2] = next
next[1] = prev
def __iter__(self):
end = self.__end
curr = end[2]
while curr is not end:
yield curr[0]
curr = curr[2]
def __reversed__(self):
end = self.__end
curr = end[1]
while curr is not end:
yield curr[0]
curr = curr[1]
def popitem(self, last=True):
if not self:
raise KeyError('dictionary is empty')
if last:
key = reversed(self).next()
else:
key = iter(self).next()
value = self.pop(key)
return key, value
def __reduce__(self):
items = [[k, self[k]] for k in self]
tmp = self.__map, self.__end
del self.__map, self.__end
inst_dict = vars(self).copy()
self.__map, self.__end = tmp
if inst_dict:
return (self.__class__, (items,), inst_dict)
return self.__class__, (items,)
def keys(self):
return list(self)
setdefault = DictMixin.setdefault
update = DictMixin.update
pop = DictMixin.pop
values = DictMixin.values
items = DictMixin.items
iterkeys = DictMixin.iterkeys
itervalues = DictMixin.itervalues
iteritems = DictMixin.iteritems
def __repr__(self):
if not self:
return '%s()' % (self.__class__.__name__,)
return '%s(%r)' % (self.__class__.__name__, self.items())
def copy(self):
return self.__class__(self)
@classmethod
def fromkeys(cls, iterable, value=None):
d = cls()
for key in iterable:
d[key] = value
return d
def __eq__(self, other):
if isinstance(other, OrderedDict):
if len(self) != len(other):
return False
for p, q in zip(self.items(), other.items()):
if p != q:
return False
return True
return dict.__eq__(self, other)
def __ne__(self, other):
return not self == other
else:
from UserDict import DictMixin
from collections import OrderedDict
from collections import Callable
class DefaultOrderedDict(OrderedDict):
def __init__(self, default_factory=None, *a, **kw):
if (default_factory is not None and
not callable(default_factory)):
raise TypeError('first argument must be callable')
OrderedDict.__init__(self, *a, **kw)
self.default_factory = default_factory
def __getitem__(self, key):
try:
return OrderedDict.__getitem__(self, key)
except KeyError:
return self.__missing__(key)
def __missing__(self, key):
if self.default_factory is None:
raise KeyError(key)
self[key] = value = self.default_factory()
return value
def __reduce__(self):
if self.default_factory is None:
args = tuple()
else:
args = self.default_factory,
return type(self), args, None, None, self.items()
def copy(self):
return self.__copy__()
def __copy__(self):
return type(self)(self.default_factory, self)
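For illustration, a minimal modern-Python sketch of the behaviour DefaultOrderedDict provides (defaultdict-style factories plus deterministic iteration order), which is what the DB class above relies on when it replaces defaultdict(set) with DefaultOrderedDict(set):

```python
from collections import OrderedDict

class DefaultOrderedDict(OrderedDict):
    """Sketch of the class above: an OrderedDict whose missing keys
    are created on first access with a factory, like defaultdict."""
    def __init__(self, default_factory=None, *a, **kw):
        if default_factory is not None and not callable(default_factory):
            raise TypeError('first argument must be callable')
        OrderedDict.__init__(self, *a, **kw)
        self.default_factory = default_factory

    def __missing__(self, key):
        # dict.__getitem__ calls this hook for missing keys.
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value

d = DefaultOrderedDict(set)
d['b'].add(1)   # 'b' is created on first access
d['a'].add(2)
print(list(d))  # insertion order is preserved: ['b', 'a']
```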
...@@ -359,6 +359,7 @@ def use(device,
assert isinstance(device, int)
gpu_init(device)
use.device_number = device
assert active_device_number() == device
else:
# This means the driver should select the GPU. As we
# need to get the device number now, we force the
...@@ -379,7 +380,16 @@
if enable_cuda:
cuda_enabled = True
print >> sys.stderr, "Using gpu device %d: %s" % (
use.device_number, active_device_name())
if device_properties(use.device_number)['regsPerBlock'] < 16384:
    # Many places will try to use too many registers per block
    # when there are only 8k registers per multiprocessor.
    _logger.warning("You are probably using an old GPU. We do not"
                    " optimize for or support such GPUs. This means"
                    " GPU code will be slow AND will crash when we"
                    " try to use features/properties that your GPU"
                    " does not support.")
except (EnvironmentError, ValueError, RuntimeError), e:
_logger.error(("ERROR: Not using GPU."
" Initialisation of device %s failed:\n%s"),
...
# This is work in progress
import theano
from theano import Op, Apply
import theano.tensor as T
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.sandbox.neighbours import Images2Neibs
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
class GpuImages2Neibs(Images2Neibs, GpuOp):
def __init__(self, mode='valid'):
if mode not in ['valid', 'wrap_centered']:
raise NotImplementedError("Only the mode valid and wrap_centered"
" have been implemented for the op"
" GpuImages2Neibs")
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step):
assert ten4.dtype == 'float32'
if not isinstance(ten4.type, CudaNdarrayType):
raise TypeError('ten4 must be cudandarray', ten4)
assert ten4.ndim == 4
assert neib_shape.ndim == 1
assert neib_step.ndim == 1
return Apply(self, [ten4, neib_shape, neib_step],
[CudaNdarrayType(broadcastable=(False, False),
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (8,)
def c_support_code_apply(self, node, nodename):
mode = self.mode
return """
//A version that uses fewer registers, but does not work in all cases.
static __global__ void k_multi_warp_less_%(nodename)s(
const int nb_batch,
const int nb_stack,
const int height,
const int width,
const int c,
const int d,
const int step_x,
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1,
const int stride2, const int stride3,
float * global_ten4,
const int out_s0, const int out_s1,
float * global_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d;
int left = tblock/grid_d;
const int a = left%%grid_c;
left = left/grid_c;
const int s = left%%nb_stack;
left = left/nb_stack;
const int n = left;
if(n>nb_batch)continue;
if(s>nb_stack)continue;
if(a>grid_c)continue;
if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*
(s + nb_stack*n));
int i = threadIdx.y; // loop over c
{
int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 )
ten4_2 += height;
else if (ten4_2 >= height)
ten4_2 -= height;
}
int j = threadIdx.x; // loop over d
{
int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 )
ten4_3 += width;
else if (ten4_3 >= width)
ten4_3 -= width;
}
int ten4_idx = stride3*ten4_3 +
stride2*ten4_2 +
stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col * out_s1 +
z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx];
}
}
}
}
static __global__ void k_multi_warp_%(nodename)s(
const int nb_batch,
const int nb_stack,
const int height,
const int width,
const int c,
const int d,
const int step_x,
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1,
const int stride2, const int stride3,
float * global_ten4,
const int out_s0, const int out_s1,
float * global_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d;
int left = tblock/grid_d;
const int a = left%%grid_c;
left = left/grid_c;
const int s = left%%nb_stack;
left = left/nb_stack;
const int n = left;
if(n>nb_batch)continue;
if(s>nb_stack)continue;
if(a>grid_c)continue;
if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*
(s + nb_stack*n));
// loop over c
for (int i = threadIdx.y; i < c; i+=blockDim.y)
{
int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 )
ten4_2 += height;
else if (ten4_2 >= height)
ten4_2 -= height;
}
// loop over d
for (int j = threadIdx.x; j < d; j+=blockDim.x)
{
int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 )
ten4_3 += width;
else if (ten4_3 >= width)
ten4_3 -= width;
}
int ten4_idx = stride3*ten4_3 +
stride2*ten4_2 +
stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col * out_s1 +
z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx];
}
}
}
}
""" % locals()
def c_code(self, node, name, inp, out, sub):
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
mode = self.mode
return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
#endif
int grid_c = -1;
int grid_d = -1;
{
if (%(ten4)s->nd != 4)
{
PyErr_Format(PyExc_TypeError, "ten4 wrong rank");
%(fail)s;
}
if (%(neib_shape)s->nd != 1)
{
PyErr_Format(PyExc_TypeError, "neib_shape wrong rank");
%(fail)s;
}
if (%(neib_shape)s->dimensions[0] != 2)
{
PyErr_Format(PyExc_ValueError,
"neib_shape has to contain two elements");
%(fail)s;
}
const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
%(neib_shape)s, 0);
const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
%(neib_shape)s, 1);
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 1);
if ( "%(mode)s" == "wrap_centered") {
if (c%%2!=1 || d%%2!=1){
PyErr_Format(PyExc_TypeError,
"Images2Neibs: in mode wrap_centered need patch with odd shapes");
%(fail)s;
}
if ( CudaNdarray_HOST_DIMS(%(ten4)s)[2] < c ||
CudaNdarray_HOST_DIMS(%(ten4)s)[3] < d)
{
PyErr_Format(PyExc_TypeError,
"Images2Neibs: wrap_centered mode does not"
" support image shapes smaller than the patch"
" shapes: neib_shape=(%%d,%%d),"
" ten4[2:]=[%%d,%%d]",
c, d, CudaNdarray_HOST_DIMS(%(ten4)s)[2],
CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
%(fail)s;
}
grid_c = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]),
step_x);
grid_d = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]),
step_y);
}else if ( "%(mode)s" == "valid") {
if ( ((CudaNdarray_HOST_DIMS(%(ten4)s))[2] < c) ||
((((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{
PyErr_Format(PyExc_TypeError,
"neib_shape[0]=%%d, neib_step[0]=%%d and"
" ten4.shape[2]=%%d not consistent",
c, step_x,
CudaNdarray_HOST_DIMS(%(ten4)s)[2]);
%(fail)s;
}
if ( ((CudaNdarray_HOST_DIMS(%(ten4)s))[3] < d) ||
((((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
{
PyErr_Format(PyExc_TypeError,
"neib_shape[1]=%%d, neib_step[1]=%%d and"
" ten4.shape[3]=%%d not consistent",
d, step_y,
CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
%(fail)s;
}
//number of patches in height
grid_c = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c)/step_x);
//number of patches in width
grid_d = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d)/step_y);
}else{
PyErr_Format(PyExc_TypeError,
"Images2Neibs: unknown mode '%(mode)s'");
%(fail)s;
}
// new dimensions for z
const int z_dim1 = c * d;
const int z_dim0 = grid_c
* grid_d
* CudaNdarray_HOST_DIMS(%(ten4)s)[1]
* CudaNdarray_HOST_DIMS(%(ten4)s)[0];
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != z_dim0)
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != z_dim1))
{
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = z_dim0;
dims[1] = z_dim1;
%(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc z output");
%(fail)s;
}
}
}
{ // NESTED SCOPE
const int nb_batch = CudaNdarray_HOST_DIMS(%(ten4)s)[0];
const int nb_stack = CudaNdarray_HOST_DIMS(%(ten4)s)[1];
const int height = CudaNdarray_HOST_DIMS(%(ten4)s)[2];
const int width = CudaNdarray_HOST_DIMS(%(ten4)s)[3];
const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
%(neib_shape)s, 0);
const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
%(neib_shape)s, 1);
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1);
//There is a max of 512 threads per block
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
//Make bigger blocks to get a better memory access pattern and
//higher core utilisation with smaller patch sizes.
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 &&
n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
n_threads.z++;
}
int nb_block;
if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
else
nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
dim3 n_blocks(std::min(32*1024,nb_block));
int n_shared = 0;
void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
float*,
int, int,
float*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
f<<<n_blocks, n_threads, n_shared>>>(
nb_batch,
nb_stack,
height, width,
c, d, step_x, step_y,
grid_c, grid_d,
CudaNdarray_HOST_STRIDES(%(ten4)s)[0],
CudaNdarray_HOST_STRIDES(%(ten4)s)[1],
CudaNdarray_HOST_STRIDES(%(ten4)s)[2],
CudaNdarray_HOST_STRIDES(%(ten4)s)[3],
CudaNdarray_DEV_DATA(%(ten4)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1],
CudaNdarray_DEV_DATA(%(z)s)
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
}
} // END NESTED SCOPE
""" % locals()
def gpu_images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
return GpuImages2Neibs(mode)(ten4, neib_shape, neib_step)
@local_optimizer()
def use_gpu_images2neibs(node):
if (type(node.op) is Images2Neibs and
node.op.mode in ['valid', 'wrap_centered']):
return [host_from_gpu(gpu_images2neibs(gpu_from_host(node.inputs[0]),
node.inputs[1], node.inputs[2],
mode=node.op.mode))]
if cuda_available:
register_gpu_opt()(use_gpu_images2neibs)
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import numpy
import theano
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
import theano.sandbox.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class T_GpuImages2Neibs(theano.sandbox.test_neighbours.T_Images2Neibs):
def __init__(self, name):
self.mode = mode_with_gpu
self.op = GpuImages2Neibs
return super(T_GpuImages2Neibs, self).__init__(name)
if __name__ == '__main__':
    import unittest
    unittest.main()
"""
TODO: implement Images2Neibs.{perform,infer_shape}() methods
"""
import theano
from theano import Op, Apply
import theano.tensor as T
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.gradient import grad_not_implemented
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
class Images2Neibs(Op):
    def __init__(self, mode='valid'):
@@ -84,8 +82,17 @@ class Images2Neibs(Op):
    def grad(self, inp, grads):
        x, neib_shape, neib_step = inp
        gz, = grads

        if self.mode in ['valid', 'ignore_borders']:
            if (neib_shape is neib_step or
                neib_shape == neib_step or
                # A Theano Constant's == does not compare the data;
                # the equals method does that.
                (hasattr(neib_shape, "equals") and
                 neib_shape.equals(neib_step))):
                return [neibs2images(gz, neib_shape, x.shape, mode=self.mode),
                        None, None]

        return [grad_not_implemented(self, 0, x), None, None]
    def c_code_cache_version(self):
        return (5,)
@@ -293,358 +300,13 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
            pad_shape = list(output_4d.shape)
            pad_shape[d] = original_shape[d] - valid_shape[d]
            output_4d = T.concatenate([output_4d, T.zeros(pad_shape)], axis=d)
    elif mode == 'valid':
        # TODO: not all modes are implemented by this code.
        # Add a check for the supported cases.
        output_4d = output_2d.reshape(original_shape)
    else:
        raise NotImplementedError("neibs2images does not support mode=%s" % mode)
    return output_4d
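The semantics of the forward op in 'valid' mode (with `neib_step == neib_shape`, the case whose gradient is implemented above) can be sketched in pure NumPy. `images2neibs_valid` is a hypothetical illustration, not the Theano implementation:

```python
import numpy


def images2neibs_valid(ten4, neib_shape):
    # Pure-NumPy sketch of Images2Neibs in 'valid' mode with
    # neib_step == neib_shape: tile each (height, width) plane into
    # non-overlapping c x d patches and flatten each patch to one row.
    b, s, h, w = ten4.shape
    c, d = neib_shape
    assert h % c == 0 and w % d == 0, "valid mode needs evenly dividing shapes"
    out = []
    for n in range(b):
        for k in range(s):
            for i in range(0, h, c):
                for j in range(0, w, d):
                    out.append(ten4[n, k, i:i + c, j:j + d].ravel())
    return numpy.array(out)
```

Because each input element appears in exactly one output row in this case, the operation is a permutation/reshape, and its gradient is just the inverse scatter that neibs2images performs.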
# This is work in progress
class GpuImages2Neibs(Images2Neibs, GpuOp):
def __init__(self, mode='valid'):
if mode not in ['valid', 'wrap_centered']:
raise NotImplementedError("Only the mode valid and wrap_centered"
" have been implemented for the op"
" GpuImages2Neibs")
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step):
assert ten4.dtype == 'float32'
if not isinstance(ten4.type, CudaNdarrayType):
raise TypeError('ten4 must be cudandarray', ten4)
assert ten4.ndim == 4
assert neib_shape.ndim == 1
assert neib_step.ndim == 1
return Apply(self, [ten4, neib_shape, neib_step],
[CudaNdarrayType(broadcastable=(False, False),
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (8,)
def c_support_code_apply(self, node, nodename):
mode = self.mode
return """
        //A version that uses fewer registers, but does not work in all cases.
static __global__ void k_multi_warp_less_%(nodename)s(
const int nb_batch,
const int nb_stack,
const int height,
const int width,
const int c,
const int d,
const int step_x,
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1, const int stride2, const int stride3,
float * global_ten4,
const int out_s0, const int out_s1,
float * global_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d;
int left = tblock/grid_d;
const int a = left%%grid_c;
left = left/grid_c;
const int s = left%%nb_stack;
left = left/nb_stack;
const int n = left;
if(n>nb_batch)continue;
if(s>nb_stack)continue;
if(a>grid_c)continue;
if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
int i = threadIdx.y; // loop over c
{
int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
}
int j = threadIdx.x; // loop over d
{
int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
}
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col * out_s1 + z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx];
}
}
}
}
static __global__ void k_multi_warp_%(nodename)s(
const int nb_batch,
const int nb_stack,
const int height,
const int width,
const int c,
const int d,
const int step_x,
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1, const int stride2, const int stride3,
float * global_ten4,
const int out_s0, const int out_s1,
float * global_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d;
int left = tblock/grid_d;
const int a = left%%grid_c;
left = left/grid_c;
const int s = left%%nb_stack;
left = left/nb_stack;
const int n = left;
if(n>nb_batch)continue;
if(s>nb_stack)continue;
if(a>grid_c)continue;
if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
for (int i = threadIdx.y; i < c; i+=blockDim.y) // loop over c
{
int ten4_2 = i + a * step_x;
if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
}
for (int j = threadIdx.x; j < d; j+=blockDim.x) // loop over d
{
int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
}
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col * out_s1 + z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx];
}
}
}
}
""" % locals()
def c_code(self, node, name, inp, out, sub):
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
mode = self.mode
return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
#endif
int grid_c = -1;
int grid_d = -1;
{
        if (%(ten4)s->nd != 4)
        {
            PyErr_Format(PyExc_TypeError, "ten4 wrong rank");
            %(fail)s;
        }
        if (%(neib_shape)s->nd != 1)
        {
            PyErr_Format(PyExc_TypeError, "neib_shape wrong rank");
            %(fail)s;
        }
if (%(neib_shape)s->dimensions[0] != 2)
{
PyErr_Format(PyExc_ValueError, "neib_shape has to contain two elements");
%(fail)s;
}
const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
if ( "%(mode)s" == "wrap_centered") {
if (c%%2!=1 || d%%2!=1){
                PyErr_Format(PyExc_TypeError, "Images2Neibs: mode wrap_centered requires patches with odd shapes");
%(fail)s;
}
if ( CudaNdarray_HOST_DIMS(%(ten4)s)[2] < c || CudaNdarray_HOST_DIMS(%(ten4)s)[3] < d)
{
                PyErr_Format(PyExc_TypeError, "Images2Neibs: in wrap_centered mode, image shapes smaller than the patch shapes are not supported: neib_shape=(%%d,%%d), ten4[2:]=[%%d,%%d]",
c, d, CudaNdarray_HOST_DIMS(%(ten4)s)[2], CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
%(fail)s;
}
grid_c = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]),
step_x);
grid_d = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]),
step_y);
}else if ( "%(mode)s" == "valid") {
if ( ((CudaNdarray_HOST_DIMS(%(ten4)s))[2] < c) ||( (((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
{
PyErr_Format(PyExc_TypeError, "neib_shape[0]=%%d, neib_step[0]=%%d and ten4.shape[2]=%%d not consistent",
c, step_x, CudaNdarray_HOST_DIMS(%(ten4)s)[2]);
%(fail)s;
}
if ( ((CudaNdarray_HOST_DIMS(%(ten4)s))[3] < d) ||( (((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
{
PyErr_Format(PyExc_TypeError, "neib_shape[1]=%%d, neib_step[1]=%%d and ten4.shape[3]=%%d not consistent",
d, step_y, CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
%(fail)s;
}
grid_c = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c)/step_x); //number of patch in height
grid_d = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d)/step_y); //number of patch in width
}else{
            PyErr_Format(PyExc_TypeError, "Images2Neibs: unknown mode '%(mode)s'");
%(fail)s;
}
// new dimensions for z
const int z_dim1 = c * d;
const int z_dim0 = grid_c
* grid_d
* CudaNdarray_HOST_DIMS(%(ten4)s)[1]
* CudaNdarray_HOST_DIMS(%(ten4)s)[0];
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != z_dim0)
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != z_dim1))
{
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = z_dim0;
dims[1] = z_dim1;
%(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc z output");
%(fail)s;
}
}
}
{ // NESTED SCOPE
const int nb_batch = CudaNdarray_HOST_DIMS(%(ten4)s)[0];
const int nb_stack = CudaNdarray_HOST_DIMS(%(ten4)s)[1];
const int height = CudaNdarray_HOST_DIMS(%(ten4)s)[2];
const int width = CudaNdarray_HOST_DIMS(%(ten4)s)[3];
const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1);
            //There is a max of 512 threads per block
            while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
            while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
            //Use bigger blocks for smaller patch sizes, to get a better
            //memory access pattern and higher core utilisation.
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
n_threads.z++;
}
int nb_block;
if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
else
nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
dim3 n_blocks(std::min(32*1024,nb_block));
int n_shared = 0;
void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
float*,
int, int,
float*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
f<<<n_blocks, n_threads, n_shared>>>(
nb_batch,
nb_stack,
height, width,
c, d, step_x, step_y,
grid_c, grid_d,
CudaNdarray_HOST_STRIDES(%(ten4)s)[0],
CudaNdarray_HOST_STRIDES(%(ten4)s)[1],
CudaNdarray_HOST_STRIDES(%(ten4)s)[2],
CudaNdarray_HOST_STRIDES(%(ten4)s)[3],
CudaNdarray_DEV_DATA(%(ten4)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1],
CudaNdarray_DEV_DATA(%(z)s)
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
}
} // END NESTED SCOPE
""" % locals()
def gpu_images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
return GpuImages2Neibs(mode)(ten4, neib_shape, neib_step)
@local_optimizer()
def use_gpu_images2neibs(node):
if type(node.op) is Images2Neibs:
return [host_from_gpu(gpu_images2neibs(gpu_from_host(node.inputs[0]),
node.inputs[1], node.inputs[2],
mode=node.op.mode))]
if cuda_available:
register_gpu_opt()(use_gpu_images2neibs)
import numpy

import theano
from theano import shared, function
import theano.tensor as T
from neighbours import images2neibs, neibs2images, Images2Neibs
from theano.gof.python25 import any
from theano.tests import unittest_tools

if theano.config.mode == 'FAST_COMPILE':
    mode_without_gpu = theano.compile.mode.get_mode(
        'FAST_RUN').excluding('gpu')
else:
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
class T_Images2Neibs(unittest_tools.InferShapeTester):
    def __init__(self, name):
        self.mode = mode_without_gpu
        self.op = Images2Neibs
        return super(T_Images2Neibs, self).__init__(name)
def test_neibs(self):
for shape, pshape in [((100, 40, 18, 18), (2, 2)),
((100, 40, 6, 18), (3, 2)),
((10, 40, 66, 66), (33, 33)),
((10, 40, 68, 66), (34, 33))
]:
for border in ['valid', 'ignore_borders']:
images = shared(numpy.arange(numpy.prod(shape)).reshape(shape))
neib_shape = T.as_tensor_variable(pshape)
f = function([], images2neibs(images, neib_shape, mode=border),
mode=self.mode)
#print images.get_value(borrow=True)
neibs = f()
#print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape),
mode=self.mode)
assert any([isinstance(node.op, self.op)
for node in f.maker.fgraph.toposort()])
#print g()
assert numpy.allclose(images.get_value(borrow=True), g())
def test_neibs_manual(self):
shape = (2, 3, 4, 4)
images = shared(numpy.arange(numpy.prod(shape)).reshape(shape))
neib_shape = T.as_tensor_variable((2, 2))
for border in ['valid', 'ignore_borders']:
f = function([], images2neibs(images, neib_shape, mode=border),
mode=self.mode)
#print images.get_value(borrow=True)
neibs = f()
#print neibs
assert numpy.allclose(neibs,[[ 0, 1, 4, 5],
[ 2, 3, 6, 7],
[ 8, 9, 12, 13],
[10, 11, 14, 15],
[16, 17, 20, 21],
[18, 19, 22, 23],
[24, 25, 28, 29],
[26, 27, 30, 31],
[32, 33, 36, 37],
[34, 35, 38, 39],
[40, 41, 44, 45],
[42, 43, 46, 47],
[48, 49, 52, 53],
[50, 51, 54, 55],
[56, 57, 60, 61],
[58, 59, 62, 63],
[64, 65, 68, 69],
[66, 67, 70, 71],
[72, 73, 76, 77],
[74, 75, 78, 79],
[80, 81, 84, 85],
[82, 83, 86, 87],
[88, 89, 92, 93],
[90, 91, 94, 95]])
g = function([], neibs2images(neibs, neib_shape, images.shape),
mode=self.mode)
assert numpy.allclose(images.get_value(borrow=True), g())
def test_neibs_manual_step(self):
shape = (2, 3, 5, 5)
images = shared(numpy.asarray(numpy.arange(numpy.prod(
shape)).reshape(shape), dtype='float32'))
neib_shape = T.as_tensor_variable((3, 3))
neib_step = T.as_tensor_variable((2, 2))
for border in ['valid', 'ignore_borders']:
f = function([], images2neibs(images, neib_shape, neib_step, mode=border),
mode=self.mode)
            neibs = f()
assert self.op in [type(node.op)
for node in f.maker.fgraph.toposort()]
assert numpy.allclose(neibs,
[[ 0, 1, 2, 5, 6, 7, 10, 11, 12],
[ 2, 3, 4, 7, 8, 9, 12, 13, 14],
[ 10, 11, 12, 15, 16, 17, 20, 21, 22],
[ 12, 13, 14, 17, 18, 19, 22, 23, 24],
[ 25, 26, 27, 30, 31, 32, 35, 36, 37],
[ 27, 28, 29, 32, 33, 34, 37, 38, 39],
[ 35, 36, 37, 40, 41, 42, 45, 46, 47],
[ 37, 38, 39, 42, 43, 44, 47, 48, 49],
[ 50, 51, 52, 55, 56, 57, 60, 61, 62],
[ 52, 53, 54, 57, 58, 59, 62, 63, 64],
[ 60, 61, 62, 65, 66, 67, 70, 71, 72],
[ 62, 63, 64, 67, 68, 69, 72, 73, 74],
[ 75, 76, 77, 80, 81, 82, 85, 86, 87],
[ 77, 78, 79, 82, 83, 84, 87, 88, 89],
[ 85, 86, 87, 90, 91, 92, 95, 96, 97],
[ 87, 88, 89, 92, 93, 94, 97, 98, 99],
[100, 101, 102, 105, 106, 107, 110, 111, 112],
[102, 103, 104, 107, 108, 109, 112, 113, 114],
[110, 111, 112, 115, 116, 117, 120, 121, 122],
[112, 113, 114, 117, 118, 119, 122, 123, 124],
[125, 126, 127, 130, 131, 132, 135, 136, 137],
[127, 128, 129, 132, 133, 134, 137, 138, 139],
[135, 136, 137, 140, 141, 142, 145, 146, 147],
[137, 138, 139, 142, 143, 144, 147, 148, 149]])
            #neibs2images does not seem to support step != neib_shape
#g = function([], neibs2images(neibs, neib_shape, images.shape),
# mode=self.mode)
#print g()
#assert numpy.allclose(images.get_value(borrow=True), g())
    def test_neibs_bad_shape(self):
        shape = (2, 3, 10, 10)
        images = shared(numpy.arange(numpy.prod(shape)).reshape(shape))

        for neib_shape in [(3, 2), (2, 3)]:
            neib_shape = T.as_tensor_variable(neib_shape)
f = function([], images2neibs(images, neib_shape), mode=self.mode)
self.assertRaises(TypeError, f)
#Test that ignore border work in that case.
f = function([], images2neibs(images, neib_shape, mode='ignore_borders'),
mode=self.mode)
f()
    def test_neibs_wrap_centered_step_manual(self):

        expected1 = [[24, 20, 21, 4, 0, 1, 9, 5, 6],
[21, 22, 23, 1, 2, 3, 6, 7, 8],
[23, 24, 20, 3, 4, 0, 8, 9, 5],
[ 9, 5, 6, 14, 10, 11, 19, 15, 16],
[ 6, 7, 8, 11, 12, 13, 16, 17, 18],
[ 8, 9, 5, 13, 14, 10, 18, 19, 15],
[19, 15, 16, 24, 20, 21, 4, 0, 1],
[16, 17, 18, 21, 22, 23, 1, 2, 3],
[18, 19, 15, 23, 24, 20, 3, 4, 0]]
expected2 = [[ 24, 20, 21, 4, 0, 1, 9, 5, 6],
[ 22, 23, 24, 2, 3, 4, 7, 8, 9],
[ 14, 10, 11, 19, 15, 16, 24, 20, 21],
[ 12, 13, 14, 17, 18, 19, 22, 23, 24]]
expected3 = [[19, 15, 16, 24, 20, 21, 4, 0, 1, 9, 5, 6, 14, 10, 11],
[17, 18, 19, 22, 23, 24, 2, 3, 4, 7, 8, 9, 12, 13, 14],
[ 9, 5, 6, 14, 10, 11, 19, 15, 16, 24, 20, 21, 4, 0, 1],
[ 7, 8, 9, 12, 13, 14, 17, 18, 19, 22, 23, 24, 2, 3, 4]]
expected4 = [[23, 24, 20, 21, 22, 3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
[21, 22, 23, 24, 20, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
[13, 14, 10, 11, 12, 18, 19, 15, 16, 17, 23, 24, 20, 21, 22],
[11, 12, 13, 14, 10, 16, 17, 18, 19, 15, 21, 22, 23, 24, 20]]
expected5 = [[24, 20, 21, 4, 0, 1, 9, 5, 6],
[22, 23, 24, 2, 3, 4, 7, 8, 9],
[ 9, 5, 6, 14, 10, 11, 19, 15, 16],
[ 7, 8, 9, 12, 13, 14, 17, 18, 19],
[19, 15, 16, 24, 20, 21, 4, 0, 1],
[17, 18, 19, 22, 23, 24, 2, 3, 4]]
expected6 = [[24, 20, 21, 4, 0, 1, 9, 5, 6],
[21, 22, 23, 1, 2, 3, 6, 7, 8],
[23, 24, 20, 3, 4, 0, 8, 9, 5],
[14, 10, 11, 19, 15, 16, 24, 20, 21],
[11, 12, 13, 16, 17, 18, 21, 22, 23],
[13, 14, 10, 18, 19, 15, 23, 24, 20]]
        #TODO: test a non-contiguous image
for shp_idx, (shape, neib_shape, neib_step, expected) in enumerate([
[(7, 8, 5, 5), (3, 3), (2, 2), expected1],
[(7, 8, 5, 5), (3, 3), (3, 3), expected2],
[(7, 8, 5, 5), (5, 3), (3, 3), expected3],
[(7, 8, 5, 5), (3, 5), (3, 3), expected4],
[(80, 90, 5, 5), (3, 3), (2, 3), expected5],
[(1025, 9, 5, 5), (3, 3), (3, 2), expected6],
[(1, 1, 5, 1035), (3, 3), (3, 3), None],
[(1, 1, 1045, 5), (3, 3), (3, 3), None],
]):
images = shared(numpy.asarray(numpy.arange(numpy.prod(
shape)).reshape(shape), dtype='float32'))
neib_shape = T.as_tensor_variable(neib_shape)
neib_step = T.as_tensor_variable(neib_step)
expected = numpy.asarray(expected)
            f = function([], images2neibs(images, neib_shape, neib_step,
                                          mode="wrap_centered"),
                         mode=self.mode)
            neibs = f()
            if expected.size > 1:
                for i in range(shape[0] * shape[1]):
                    assert numpy.allclose(neibs[i * expected.shape[0]:
                                                (i + 1) * expected.shape[0], :],
                                          expected + 25 * i)

            assert self.op in [type(node.op)
                               for node in f.maker.fgraph.toposort()]

            #g = function([], neibs2images(neibs, neib_shape, images.shape),
            #             mode=self.mode)
            #TODO: why is this commented out?
            #assert numpy.allclose(images.get_value(borrow=True), g())
    def test_neibs_bad_shape_wrap_centered(self):
        shape = (2, 3, 10, 10)
        images = shared(numpy.arange(numpy.prod(shape)).reshape(shape))

        for neib_shape in [(3, 2), (2, 3)]:
            neib_shape = T.as_tensor_variable(neib_shape)

            f = function([], images2neibs(images, neib_shape,
                                          mode="wrap_centered"),
                         mode=self.mode)
            self.assertRaises(TypeError, f)

        for shape in [(2, 3, 2, 3), (2, 3, 3, 2)]:
            images = shared(numpy.arange(numpy.prod(shape)).reshape(shape))
            neib_shape = T.as_tensor_variable((3, 3))
            f = function([], images2neibs(images, neib_shape,
                                          mode="wrap_centered"),
                         mode=self.mode)
            self.assertRaises(TypeError, f)
        # Test valid shapes
        shape = (2, 3, 3, 3)
        images = shared(numpy.arange(numpy.prod(shape)).reshape(shape))
        neib_shape = T.as_tensor_variable((3, 3))

        f = function([], images2neibs(images, neib_shape, mode="wrap_centered"),
                     mode=self.mode)
        f()

    def test_grad_wrap_centered(self):
        # It is not implemented for now, so test that we raise an error.
        shape = (2, 3, 6, 6)
        images_val = numpy.random.rand(*shape).astype('float32')

        def fn(images):
            return images2neibs(images, (3, 3), mode='wrap_centered')

        self.assertRaises(NotImplementedError, unittest_tools.verify_grad,
                          fn, [images_val], mode=self.mode)
def test_grad_valid(self):
shape = (2, 3, 4, 4)
images_val = numpy.random.rand(*shape).astype('float32')
def fn(images):
return images2neibs(images, (2, 2))
unittest_tools.verify_grad(fn, [images_val], mode=self.mode,
eps=0.1)
# The grad is not defined when the neib_shape and neib_step
# are not the same.
def fn(images):
return images2neibs(images, (2, 2), (1, 1))
self.assertRaises(NotImplementedError,
unittest_tools.verify_grad, fn, [images_val],
mode=self.mode)
def test_grad_ignore_border(self):
shape = (2, 3, 5, 5)
images = T.dtensor4()
images_val = numpy.random.rand(*shape).astype('float32')
        def fn(images):
            return images2neibs(images, (2, 2),
                                mode='ignore_borders')
unittest_tools.verify_grad(fn, [images_val], mode=self.mode,
eps=0.1)
def test_neibs2images_grad(self):
# say we had images of size (2, 3, 20, 20)
# then we extracted 2x2 neighbors on this, we get (2 * 3 * 10 * 10, 4)
neibs = T.dmatrix()
neibs_val = numpy.random.rand(600, 4)
def fn(neibs):
return neibs2images(neibs, (2, 2), (2, 3, 20, 20))
unittest_tools.verify_grad(fn, [neibs_val], mode=self.mode,
eps=0.1)
def test_neibs_valid_with_inconsistent_borders(self):
shape = (2, 3, 5, 5)
images = T.dtensor4()
images_val = numpy.arange(numpy.prod(shape),
dtype='float32').reshape(shape)
def fn(images):
return T.sum(T.sqr(images2neibs(images, (2, 2), mode='valid')),
axis=[0, 1])
f = theano.function([images],
T.sqr(images2neibs(images, (2, 2), mode='valid')),
mode=self.mode)
self.assertRaises(TypeError, f, images_val)
    def speed_neibs(self):
        shape = (100, 40, 18, 18)
        images = shared(numpy.arange(numpy.prod(shape),
                                     dtype='float32').reshape(shape))
        neib_shape = T.as_tensor_variable((3, 3))

        f = function([], images2neibs(images, neib_shape),
                     mode=self.mode)

        for i in range(1000):
            f()
    def speed_neibs_wrap_centered(self):
        shape = (100, 40, 18, 18)
        images = shared(numpy.arange(numpy.prod(shape),
                                     dtype='float32').reshape(shape))
        neib_shape = T.as_tensor_variable((3, 3))

        f = function([],
                     images2neibs(images, neib_shape, mode="wrap_centered"),
                     mode=self.mode)

        for i in range(1000):
            f()
# Disable the test as the grad is wrongly implemented
def tes_neibs_grad_verify_grad():
shape = (2, 3, 4, 4)
images = T.dtensor4()
images_val = numpy.arange(numpy.prod(shape),
dtype='float32').reshape(shape)
def fn(images):
return T.sum(T.sqr(images2neibs(images, (2, 2))), axis=[0, 1])
unittest_tools.verify_grad(fn, [images_val], mode=mode_without_gpu)
if cuda.cuda_available:
unittest_tools.verify_grad(fn, [images_val], mode=mode_with_gpu)
def test_neibs_grad_verify_grad_warp_centered():
# It is not implemented for now. So test that we raise an error.
shape = (2, 3, 6, 6)
images = T.dtensor4()
images_val = numpy.arange(numpy.prod(shape),
dtype='float32').reshape(shape)
def fn(images):
return T.sum(T.sqr(images2neibs(images, (3, 3), mode='wrap_centered')),
axis=[0, 1])
try:
unittest_tools.verify_grad(fn, [images_val], mode=mode_without_gpu)
raise Exception("Expected an error")
except NotImplementedError:
pass
if cuda.cuda_available:
try:
unittest_tools.verify_grad(fn, [images_val], mode=mode_with_gpu)
raise Exception("Expected an error")
except NotImplementedError:
pass
def test_neibs_ignore_border():
shape = (2, 3, 5, 5)
images = T.dtensor4()
images_val = numpy.arange(numpy.prod(shape),
dtype='float32').reshape(shape)
def fn(images):
return T.sum(T.sqr(images2neibs(images, (2, 2),
mode='ignore_borders')), axis=[0, 1])
# Disable the test as the grad is wrongly implemented
#unittest_tools.verify_grad(fn, [images_val], mode=mode_without_gpu)
# not implemented for gpu
# if cuda.cuda_available:
# unittest_tools.verify_grad(fn, [images_val], mode=mode_with_gpu)
def test_neibs_valid_with_inconsistent_borders():
shape = (2, 3, 5, 5)
images = T.dtensor4()
images_val = numpy.arange(numpy.prod(shape),
dtype='float32').reshape(shape)
def fn(images):
return T.sum(T.sqr(images2neibs(images, (2, 2), mode='valid')),
axis=[0, 1])
f = theano.function([images],
T.sqr(images2neibs(images, (2, 2), mode='valid')),
mode=mode_without_gpu)
try:
f(images_val)
assert False, "An error was expected"
except TypeError:
    # This is expected if the assert is there
    pass
# Disable the test as the grad is wrongly implemented
def tes_neibs2images_crash_on_grad():
# say we had images of size (2, 3, 20, 20)
# then we extracted 2x2 neighbors on this, we get (2 * 3 * 10 * 10, 4)
neibs = T.dmatrix()
neibs_val = numpy.random.rand(600, 4)
to_images = T.sum(neibs2images(neibs, (2, 2), (2, 3, 20, 20)))
g = T.grad(to_images, neibs)
fn = theano.function([neibs], to_images, mode=mode_without_gpu)
#print "Compiled"
fn(neibs_val)
if __name__ == '__main__':
    unittest.main()
...@@ -5391,11 +5391,13 @@ class Reshape(Op):
            requ = list(requ.data)
            requ_part = [ele for ele in requ if ele != -1]
            crit = len(requ) - len(requ_part)
            if crit == 1 and len(requ_part) > 0:
                missing = mul(*ishapes[0]) / mul(*requ_part)
                for i, ele in enumerate(requ):
                    if ele == -1:
                        requ[i] = missing
            elif crit == 1:  # we reshape to -1
                requ = [mul(*ishapes[0])]
            elif crit > 1:
                raise ValueError('shape argument to Reshape.perform'
                                 ' must have at most one entry equal to -1')
...
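The -1 inference above can be sketched standalone in plain Python; `infer_reshape` is a hypothetical helper for illustration, not Theano's API (which works on symbolic shapes via `mul`):

```python
def infer_reshape(in_shape, requ):
    # Mirror Reshape's shape inference: at most one -1 entry is
    # replaced by the missing dimension; a bare (-1,) flattens.
    requ = list(requ)
    requ_part = [ele for ele in requ if ele != -1]
    crit = len(requ) - len(requ_part)
    total = 1
    for d in in_shape:
        total *= d
    if crit == 1 and len(requ_part) > 0:
        part = 1
        for d in requ_part:
            part *= d
        return [total // part if ele == -1 else ele for ele in requ]
    elif crit == 1:  # reshape to -1: flatten
        return [total]
    elif crit > 1:
        raise ValueError('shape argument to Reshape.perform'
                         ' must have at most one entry equal to -1')
    return requ

print(infer_reshape((2, 3, 4), (4, -1)))  # [4, 6]
print(infer_reshape((2, 3, 4), (-1,)))    # [24]
```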
...@@ -16,25 +16,26 @@ import numpy
import theano
from theano.tensor import (as_tensor_variable, blas, get_constant_value,
                           patternbroadcast)
from theano import OpenMPOp, config
from theano.gof import Apply
from theano.gof.python25 import any

imported_scipy_signal = False
try:
    # TODO: move these back out to global scope when they no longer
    # cause an atexit error
    from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
    from scipy.signal.sigtools import _convolve2d
    imported_scipy_signal = True
except ImportError:
    pass

_logger = logging.getLogger("theano.tensor.nnet.conv")
def conv2d(input, filters, image_shape=None, filter_shape=None,
           border_mode='valid', subsample=(1, 1), **kargs):
    """This function will build the symbolic graph for convolving a stack of input
    images with a set of filters. The implementation is modelled after
    Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp but
...@@ -62,8 +63,10 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
    :param filter_shape: (nb filters, stack size, nb row, nb col)
        Optional, used for optimization.

    :param kwargs: kwargs are passed onto ConvOp.
        Can be used to set the following:
        unroll_batch, unroll_kern, unroll_patch,
        openmp (see ConvOp doc)

        openmp: By default has the same value as
        config.openmp. For small image, filter,
        with openmp on a core 2 duo.

    :rtype: symbolic 4D tensor
    :return: set of feature maps generated by convolutional layer. Tensor is
        of shape (batch size, nb filters, output row, output col)
    """
...@@ -87,20 +90,22 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
        image_shape = list(image_shape)
        for i in xrange(len(image_shape)):
            if image_shape[i] is not None:
                image_shape[i] = get_constant_value(
                    as_tensor_variable(image_shape[i]))
                assert str(image_shape[i].dtype).startswith('int')
                image_shape[i] = int(image_shape[i])

    if filter_shape is not None:
        filter_shape = list(filter_shape)
        for i in xrange(len(filter_shape)):
            if filter_shape[i] is not None:
                filter_shape[i] = get_constant_value(
                    as_tensor_variable(filter_shape[i]))
                assert str(filter_shape[i].dtype).startswith('int')
                filter_shape[i] = int(filter_shape[i])

    if image_shape and filter_shape:
        try:
            assert image_shape[1] == filter_shape[1]
        except Exception:
            print 'image ', image_shape, ' filters ', filter_shape
            raise
...@@ -118,12 +123,12 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
        bsize, imshp = None, None

    op = ConvOp(output_mode=border_mode, dx=subsample[0], dy=subsample[1],
                imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize, **kargs)
    return op(input, filters)


class ConvOp(OpenMPOp):
    """
    This Op serves a dual purpose: it can implement a vanilla 2D convolution
    (as taught in any signal processing class) or implement the
...@@ -141,13 +146,13 @@ class ConvOp(Op):
    The output of ConvOp is a 4D tensor, generated as follows:
        output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
    where b is the mini-batch index, k the filter index and * is the
    convolution operator.
    """
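A naive NumPy reference for the formula above in 'valid' mode; `conv_op_ref` is an illustrative helper, not part of Theano, and the kernel flip reflects that * is true (signal-processing) convolution:

```python
import numpy as np

def conv_op_ref(inp, filters):
    # output[b, k] = sum_i input[b, i] * filter[k, i], 'valid' mode,
    # where * is 2D convolution (kernel flipped in both axes).
    bsize, nstack, irows, icols = inp.shape
    nkern, _, krows, kcols = filters.shape
    orows, ocols = irows - krows + 1, icols - kcols + 1
    out = np.zeros((bsize, nkern, orows, ocols))
    for b in range(bsize):
        for k in range(nkern):
            for i in range(nstack):
                flipped = filters[k, i, ::-1, ::-1]
                for r in range(orows):
                    for c in range(ocols):
                        patch = inp[b, i, r:r + krows, c:c + kcols]
                        out[b, k, r, c] += (patch * flipped).sum()
    return out
```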
    __attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
                   'unroll_batch', 'unroll_kern', 'unroll_patch',
                   'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
    """These attributes uniquely identify the behaviour of this op for
    given inputs. Do not set openmp here.
    """
...@@ -161,63 +166,63 @@ class ConvOp(Op):
    # using the real shape and the same dtype could also help.

    # unroll_batch, unroll_kern, valid time, full time
    speed_unroll_batch_kern = [(1, 1, 2.4661250114440918, 6.5472931861877441),
                               (1, 2, 1.5869178771972656, 5.1499760150909424),
                               (1, 3, 1.4270510673522949, 3.6593470573425293),
                               (1, 4, 1.3373479843139648, 3.3451821804046631),
                               (1, 5, 1.2818830013275146, 3.1444568634033203),
                               (1, 6, 1.2521560192108154, 3.0256359577178955),
                               (1, 10, 1.2134110927581787, 2.9174180030822754),
                               (2, 1, 1.657214879989624, 4.5261678695678711),
                               (2, 2, 1.2123160362243652, 2.9747390747070312),
                               (2, 3, 1.0758891105651855, 2.5690360069274902),
                               (2, 4, 1.0683329105377197, 2.4233770370483398),
                               (2, 5, 1.0955719947814941, 2.3999948501586914),
                               (2, 6, 1.5935721397399902, 2.6878271102905273),
                               (2, 10, 1.8511250019073486, 3.2417428493499756),
                               (3, 1, 1.5948119163513184, 3.631148099899292),
                               (3, 2, 1.0761330127716064, 2.6011371612548828),
                               (3, 3, 1.0551531314849854, 2.4200370311737061),
                               (3, 4, 1.3930759429931641, 2.5211219787597656),
                               (3, 5, 1.4330689907073975, 2.5704989433288574),
                               (3, 6, 1.362138032913208, 2.5964410305023193),
                               (3, 10, 1.6582000255584717, 2.9907989501953125),
                               (4, 1, 1.4793620109558105, 3.3473429679870605),
                               (4, 2, 1.0671560764312744, 2.4171769618988037),
                               (4, 3, 1.2569692134857178, 2.2807950973510742),
                               (4, 4, 1.3456289768218994, 2.6219108104705811),
                               (4, 5, 1.4055080413818359, 2.4606490135192871),
                               (4, 6, 1.372107982635498, 2.551663875579834),
                               (4, 10, 1.599470853805542, 2.9172940254211426),
                               (5, 1, 1.4115700721740723, 3.2077109813690186),
                               (5, 2, 1.0635769367218018, 2.2648060321807861),
                               (5, 3, 1.3842809200286865, 2.6135518550872803),
                               (5, 4, 1.3470511436462402, 2.3852400779724121),
                               (5, 5, 1.3539440631866455, 2.5245928764343262),
                               (5, 6, 1.4037849903106689, 2.5985310077667236),
                               (5, 10, 1.6120610237121582, 2.8127608299255371),
                               (6, 1, 1.3623628616333008, 3.021122932434082),
                               (6, 2, 1.1697649955749512, 2.6285450458526611),
                               (6, 3, 1.2980999946594238, 2.4746189117431641),
                               (6, 4, 1.3739941120147705, 2.5579929351806641),
                               (6, 5, 1.3967819213867188, 2.5522029399871826),
                               (6, 6, 1.4279270172119141, 2.6127138137817383),
                               (6, 10, 1.605496883392334, 2.864037036895752),
                               (10, 1, 1.6401121616363525, 2.970099925994873),
                               (10, 2, 1.46710205078125, 2.7231831550598145),
                               (10, 3, 1.4193780422210693, 2.6087639331817627),
                               (10, 4, 1.4657118320465088, 2.6246678829193115),
                               (10, 5, 1.5052611827850342, 2.6542458534240723),
                               (10, 6, 1.5214400291442871, 2.7243161201477051),
                               (10, 10, 1.6116268634796143, 2.956165075302124)]

    # valid time, full time
    speed_unroll_patch_noshape = [2.0109100341796875, 5.8175678253173828]
    # valid time, full time
    speed_unroll_patch_shape = [1.2967290878295898, 5.5283889770507812]
    @staticmethod
    def getOutputShape(inshp, kshp, stride=(1, 1), mode='valid'):
        """
        Computes the output dimensions of convolving an image of shape "inshp"
        with kernels of shape "kshp".

...@@ -228,26 +233,27 @@ class ConvOp(Op):
        :return: (rows,cols) of output image
        """
        dx, dy = stride
        if mode == 'valid':
            s = -1
        else:
            s = 1
        inshp, kshp = numpy.array(inshp), numpy.array(kshp)
        return numpy.int64(numpy.ceil((inshp + s * kshp - s * 1) /
                                      numpy.array([dx, dy], dtype='float')))
    def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
                 dx=1, dy=1,
                 output_mode='valid',

                 unroll_batch=None,
                 unroll_kern=None,
                 unroll_patch=None,

                 imshp_logical=None,
                 kshp_logical=None,
                 kshp_logical_top_aligned=True,
                 verbose=0,
                 version=-1,
                 openmp=None):
        """
        Initializes a ConvOp with given output_mode (full/valid). All other
        parameters are optional and are only used to generate more optimized c
...@@ -259,12 +265,13 @@ class ConvOp(Op):
        By default we try to select the fastest version. You can specify it
        with the unroll_batch, unroll_kern, and unroll_patch parameters.

        The second type of optimization is hardcoding some dimensions into
        the code when all shapes are known.
        This makes a significant difference for the 'full' output_mode.

        Sometimes, the fastest implementation on x86-64 uses
        {unroll_batch=4, unroll_kern=4, unroll_patch=False}
        with all other shape parameters being provided.

        For optimizing other architectures, see:
        Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
...@@ -278,7 +285,8 @@ class ConvOp(Op):
        Optional parameters: (will generate more optimal c code)

        :type imshp: tuple of len 2 or 3: 2 for 2d image,
            3 for a stack of 2d images.
        :param imshp: (stacksize, nb image row, nb image col)
        :type kshp: tuple of len 2
        :param kshp: (nb kernel row, nb kernel col)
...@@ -294,16 +302,18 @@ class ConvOp(Op):
        Params which select the version of code used:

        :type unroll_patch: bool
        :param unroll_patch: use a version of c_code that unrolls the patch
            loop. It does not require all shape information to work, but if
            all shape information is present, it will use it to hardcode the
            values in the code for faster code.

        :type unroll_batch: int
        :param unroll_batch: use a version of c_code that unrolls the batch
            (by unroll_batch) and the nkern (by unroll_kern) loops. The size
            must be a multiple of bsize or nkern respectively.

        :type unroll_kern: int
        :param unroll_kern: use a version of c_code that unrolls the batch
            (by unroll_batch) and the nkern (by unroll_kern) loops. The size
            must be a multiple of bsize or nkern respectively.

        :type verbose: int
...@@ -311,13 +321,27 @@ class ConvOp(Op): ...@@ -311,13 +321,27 @@ class ConvOp(Op):
:type version: int :type version: int
:param version: passed to GpuConv :param version: passed to GpuConv
:param imshp_logical: used internally when we generate the gradient when dx!=1 or dy!=1 The 3 following parameters are used internally when we generate
:param kshp_logical: idem the gradient when dx!=1 or dy!=1.
:param kshp_logical_top_aligned: idem :param imshp_logical: Default None. None value is equivalent to imshp
value. When imshp_logical != imshp, it tell we need to insert 0 in
the image before we do the convolution. For example, when dx==dy==2
and the image is [[1, 2], [3, 4]], we should make as if the image
was [[1, 0, 2, 0], [0, 0, 0, 0], [3, 0, 4, 0], [0, 0, 0, 0]].
Our python code insert the zero, but the c code optimize it.
imshp_logical != imshp when taking the grad again the weights or
the image when the output_mode is full and `dx != 1` or `dy != 1`.
:param kshp_logical: idem but for kshp and used for the grad again the
weights when the output_mode is valid and `dx != 1` or `dy != 1`.
:param kshp_logical_top_aligned: Used in the same case.Default to True.
Set to False in the grad again the weight when the
output_mode is full.
""" """
        # We must continue to consider None as 1 for backward compatibility.
        if dx is None:
            dx = 1

        if dy is None:
            dy = 1

        if int(dx) != dx:
            raise TypeError('ConvOp.__init__ param dx must be an int', dx)
...@@ -330,22 +354,23 @@ class ConvOp(Op):
        all_shape = imshp is not None and kshp is not None and \
            nkern is not None and bsize is not None

        if (unroll_batch > 0 or unroll_kern > 0) and not all_shape:
            raise Exception("In ConvOp, when using unroll_batch and"
                            " unroll_nkern, all shape are needed")
        # Init the openmp attribute
        super(ConvOp, self).__init__(openmp=openmp)
        if not all_shape or self.openmp:
            # Only this version is parallelized
            unroll_patch = True

        if imshp is not None:
            imshp = tuple(imshp)
            if len(imshp) == 2:
                imshp = (1,) + imshp
            elif len(imshp) == 3:
                imshp = imshp
            else:
                raise Exception("bad len for imshp")
...@@ -356,73 +381,80 @@ class ConvOp(Op):
        self.kshp = kshp
        self.nkern = nkern
        self.bsize = bsize
        self.dx = dx
        self.dy = dy
        self.verbose = verbose
        self.version = version

        # a triple
        self.imshp_logical = self.imshp
        if imshp_logical is not None:
            self.imshp_logical = tuple(imshp_logical)
        assert ((self.imshp is None and self.imshp_logical is None) or
                (len(self.imshp) == len(self.imshp_logical)))

        # a pair
        self.kshp_logical = self.kshp
        if kshp_logical is not None:
            self.kshp_logical = tuple(kshp_logical)
        self.kshp_logical_top_aligned = kshp_logical_top_aligned

        self.unroll_batch = unroll_batch
        self.unroll_kern = unroll_kern
        self.unroll_patch = unroll_patch

        if self.unroll_batch and not self.unroll_kern:
            self.unroll_kern = 1
        if self.unroll_kern and not self.unroll_batch:
            self.unroll_batch = 1
        # downcast unroll_batch if not a divisor of batch size
        if self.unroll_batch > 0 and self.bsize % self.unroll_batch != 0:
            if self.bsize <= self.unroll_batch:
                self.unroll_batch = self.bsize
            else:
                # find the maximum value under unroll_batch that would work
                new = self.unroll_batch
                assert(new >= 1)
                while self.bsize % new != 0:
                    new -= 1

                warnstr = ("OPTIMISATION WARNING: in ConvOp.__init__() "
                           "unroll_batch(%i) must be 0 or a divisor of"
                           " bsize(%i). We revert it to %i. This"
                           " won't change the result, but may make it slower.")
                _logger.warn(warnstr, self.unroll_batch, self.bsize, new)
                self.unroll_batch = new

        # downcast unroll_kern if not a divisor of nb of kernel
        if self.unroll_kern > 0 and self.nkern % self.unroll_kern != 0:
            if self.nkern <= self.unroll_kern:
                self.unroll_kern = self.nkern
            else:
                # find the maximum value under unroll_kern that would work
                new = self.unroll_kern
                assert(new >= 1)
                while self.nkern % new != 0:
                    new -= 1

                warnstr = ("OPTIMISATION WARNING: in ConvOp.__init__()"
                           " unroll_kern(%i) should be 0 or a divisor of"
                           " nkern(%i). We revert it to %i. This"
                           " won't change the result, but may make it slower.")
                _logger.warn(warnstr, self.unroll_kern, self.nkern, new)
                self.unroll_kern = new
        if all_shape:
            self.outshp = ConvOp.getOutputShape(self.imshp_logical[1:],
                                                self.kshp_logical, (dx, dy),
                                                output_mode)
            self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical[1:],
                                                    self.kshp_logical, (1, 1),
                                                    output_mode)
        else:
            self.outshp = None
            self.fulloutshp = None
...@@ -430,51 +462,59 @@ class ConvOp(Op):
        self.out_mode = output_mode
        if not self.out_mode in ["valid", "full"]:
            raise Exception("Mode %s not implemented" % self.out_mode)

        if all_shape and not (self.outshp > 0).all():
            raise Exception("Bad size for the output shape. Verify that [post-"
                            "supersampling] input shape (%s) and kern"
                            " shape(%s) are ok. (Hint: kerns must fit inside"
                            " image in valid mode)" %
                            (self.imshp_logical, self.kshp_logical))

        if (self.unroll_kern is None and
            self.unroll_batch is None and
            self.unroll_patch is None):
            # no version specified. Find the fastest we have
            if self.bsize is None and self.nkern is None:
                self.unroll_patch = True
            elif self.bsize is not None and self.nkern is not None:
                bsize = self.bsize
                nkern = self.nkern
                if bsize is None:
                    bsize = 1
                if nkern is None:
                    nkern = 1

                mode_idx = 0
                if self.out_mode != "valid":
                    mode_idx = 1

                if all_shape:
                    time_unroll_patch = self.speed_unroll_patch_shape[mode_idx]
                else:
                    time_unroll_patch = self.speed_unroll_patch_noshape[
                        mode_idx]
                time_unroll_batch_kern = 9999999
                for i in xrange(len(self.speed_unroll_batch_kern)):
                    if (bsize % self.speed_unroll_batch_kern[i][0] == 0 and
                        nkern % self.speed_unroll_batch_kern[i][1] == 0):
                        if self.speed_unroll_batch_kern[i][2 + mode_idx] < time_unroll_batch_kern:
                            time_unroll_batch_kern = self.speed_unroll_batch_kern[i][2 + mode_idx]
                            time_unroll_batch_kern_idx = i
                if time_unroll_patch < time_unroll_batch_kern:
                    self.unroll_patch = True
                else:
                    self.unroll_batch = self.speed_unroll_batch_kern[
                        time_unroll_batch_kern_idx][0]
                    self.unroll_kern = self.speed_unroll_batch_kern[
                        time_unroll_batch_kern_idx][1]
                    self.unroll_patch = False

            _logger.debug("AUTO FIND VERSION OF C_CODE OF CONV OP "
                          "%s %s %s %s %s %s %s",
                          self.unroll_batch, self.unroll_kern,
                          self.unroll_patch,
                          self.bsize, self.nkern, time_unroll_patch,
                          time_unroll_batch_kern)
        self._rehash()

        if config.op.set_flops:
...@@ -504,41 +544,46 @@ class ConvOp(Op):
        return self.__hashval

    def __str__(self):
        return "ConvOp{" + ",".join(str((a, getattr(self, a)))
                                    for a in self.__attrnames) + "}"
    def set_flops(self):
        """ Useful with the hack in profilemode to print the MFlops"""
        if self.out_mode == "valid":
            # nb mul and add by output pixel
            self.flops = self.kshp[0] * self.kshp[1] * 2
            # nb flops by output image
            self.flops *= self.outshp[0] * self.outshp[1]
            # for all output images; n_stack == self.imshp[0]
            self.flops *= self.imshp[0] * self.nkern * self.bsize
        else:  # full mode not implemented in closed form, count by looping
            self.flops = 0
            for out_row in xrange(self.outshp[0]):  # loop over output row
                for out_col in xrange(self.outshp[1]):  # loop over output col
                    for row in xrange(self.kshp[0]):  # loop over kern row
                        if (row + out_row - self.kshp[0] + 1 < 0 or
                                row + out_row - self.kshp[0] + 1 >=
                                self.imshp[1]):
                            continue
                        col = 0
                        max_col = self.kshp[1]
                        img_col = out_col - self.kshp[1] + 1
                        max_col = min(max_col, self.imshp[2] - img_col)
                        if img_col < 0:
                            col = -img_col
                            img_col += col
                        while col < max_col:  # loop over kern col
                            self.flops += 2
                            col += 1
            # for all output images; n_stack == self.imshp[0]
            self.flops *= self.imshp[0] * self.nkern * self.bsize
            assert self.flops == self.bsize * self.nkern * self.imshp[0] * \
                self.kshp[0] * self.kshp[1] * \
                self.imshp[1] * self.imshp[2] * 2
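The closed form in the assert above can be checked independently of Theano: in full mode every image pixel meets every kernel tap exactly once. A standalone sketch (plain Python, made-up shapes, not part of the Op) that counts flops the same way the loop does:

```python
def full_conv_flops(imshp, kshp):
    """Count multiply-add flops of a 2D 'full' convolution by looping
    over every (output pixel, kernel tap) pair that touches the image."""
    ir, ic = imshp
    kr, kc = kshp
    out_r, out_c = ir + kr - 1, ic + kc - 1  # full-mode output shape
    flops = 0
    for orow in range(out_r):
        for ocol in range(out_c):
            for krow in range(kr):
                irow = orow - krow          # image row touched by this tap
                if irow < 0 or irow >= ir:
                    continue
                for kcol in range(kc):
                    icol = ocol - kcol
                    if 0 <= icol < ic:
                        flops += 2          # one multiply and one add
    return flops

# Each image pixel meets each kernel tap exactly once in full mode,
# which gives the closed form asserted in set_flops:
assert full_conv_flops((5, 7), (3, 2)) == 3 * 2 * 5 * 7 * 2
```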
    def make_node(self, inputs, kerns):
        # TODO: find a way to make ConvOp work for N-D (after NIPS09)
@@ -551,19 +596,23 @@ class ConvOp(Op):
        _kerns = as_tensor_variable(kerns)
        # TODO: lift this restriction by upcasting either inputs or kerns
        if _inputs.ndim != 4:
            raise TypeError('ConvOp (make_node) requires input be a 4D tensor;'
                            ' received "%s" (%i dims)' %
                            (inputs, _inputs.ndim))
        if _kerns.ndim != 4:
            raise TypeError('make_node requires 4D tensor of kernels')
        if _inputs.type.dtype != _kerns.type.dtype:
            raise NotImplementedError(
                "The image and the kernel must have the same type. "
                "inputs(%s), kerns(%s)" % (_inputs.dtype, _kerns.dtype))
        if self.outshp is not None:
            bcastable23 = [self.outshp[0] == 1, self.outshp[1] == 1]
        else:
            bcastable23 = [False, False]
        output = theano.tensor.tensor(dtype=_inputs.type.dtype,
                                      broadcastable=[_inputs.broadcastable[0],
                                                     _kerns.broadcastable[0]] +
                                      bcastable23)

        return Apply(self, [_inputs, _kerns], [output])
@@ -582,10 +631,12 @@ class ConvOp(Op):
            if self.kshp_logical:
                kshp = self.kshp_logical
            try:
                fmshp = ConvOp.getOutputShape(imshp[1:],
                                              kshp, (self.dx, self.dy),
                                              self.out_mode)
            except TypeError:
                raise theano.tensor.ShapeError()
            outshp = (batch_size, fmo) + tuple(fmshp)
            return [outshp]
        else:
            # Haven't implemented this case. imshp and kshp may be symbolic
@@ -593,8 +644,7 @@ class ConvOp(Op):
            # we simply let the default function do its work.
            raise theano.tensor.ShapeError()
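For reference, the shape arithmetic delegated to `ConvOp.getOutputShape` above can be sketched as a standalone function; this is an approximation following the usual valid/full formulas (the same `ceil` expressions appear in the `dim_zz_affect` C snippet later in this file), not the library's actual implementation:

```python
import math

def conv_out_shape(imshp, kshp, stride, mode):
    """Plausible re-derivation of the 2D output shape: 'valid' keeps
    only fully-overlapping positions, 'full' keeps every position with
    any overlap; both are then subsampled by the stride."""
    return tuple(
        int(math.ceil((im - k + 1) / float(d))) if mode == 'valid'
        else int(math.ceil((im + k - 1) / float(d)))
        for im, k, d in zip(imshp, kshp, stride))

assert conv_out_shape((7, 9), (3, 3), (1, 1), 'valid') == (5, 7)
assert conv_out_shape((7, 9), (3, 3), (1, 1), 'full') == (9, 11)
assert conv_out_shape((7, 9), (3, 3), (2, 2), 'valid') == (3, 4)
```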
    def perform(self, node, inp, out):
        """
        By default if len(img2d.shape)==3, we
        """
@@ -603,9 +653,12 @@ class ConvOp(Op):
        if not imported_scipy_signal:
            raise theano.gof.utils.MethodNotDefined(
                "c_headers", type(self), self.__class__.__name__,
                "Need the python package for scipy.signal to be installed "
                "for the python implementation. You can use the C"
                " implementation instead.")
        # TODO: move these back out to global scope when they no longer
        # cause an atexit error
        imshp = self.imshp
        if imshp is None or any([x is None for x in imshp]):
            imshp = tuple(img2d.shape[1:])
@@ -634,39 +687,43 @@ class ConvOp(Op):
        if self.fulloutshp is not None:
            fulloutshp = tuple(self.fulloutshp)
        else:
            fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical[1:],
                                                     kshp_logical, (1, 1),
                                                     self.out_mode))

        if z[0] is None or z[0].shape != (bsize,) + (nkern,) + fulloutshp:
            z[0] = numpy.zeros((bsize,) + (nkern,) + fulloutshp,
                               dtype=img2d.dtype)
        zz = z[0]

        stacklen = imshp[0]

        img2d = img2d.reshape((bsize,) + imshp)
        filtersflipped = filtersflipped.reshape((nkern, stacklen) + kshp)
        if self.imshp != self.imshp_logical:
            # assuming that to get from imshp to imshp_logical we insert
            # zeros in missing spots
            rstride = int(numpy.ceil(imshp_logical[1] / float(imshp[1])))
            cstride = int(numpy.ceil(imshp_logical[2] / float(imshp[2])))
            buf = numpy.zeros((bsize,) + imshp_logical, dtype=img2d.dtype)
            buf[:, :, ::rstride, ::cstride] = img2d
            img2d = buf
            del buf, rstride, cstride
        if kshp != kshp_logical:
            rstride = int(numpy.ceil(kshp_logical[0] / float(kshp[0])))
            cstride = int(numpy.ceil(kshp_logical[1] / float(kshp[1])))
            buf = numpy.zeros((nkern, stacklen) + self.kshp_logical,
                              dtype=filtersflipped.dtype)
            if self.kshp_logical_top_aligned:
                roffset = coffset = 0
            else:
                roffset = (kshp_logical[0] - (kshp[0] * rstride) -
                           1 + rstride) % rstride
                coffset = (kshp_logical[1] - (kshp[1] * cstride) -
                           1 + cstride) % cstride
            assert roffset >= 0
            assert coffset >= 0
            buf[:, :, roffset::rstride, coffset::cstride] = filtersflipped
            filtersflipped = buf
            del buf, rstride, cstride
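The zero-insertion trick used above for the logical shapes can be shown in isolation: assigning into a strided slice of a zero buffer leaves zeros in every skipped position. A toy numpy sketch with made-up shapes (logical 3x5, physical 2x3):

```python
import numpy

img = numpy.arange(1, 7, dtype='float64').reshape(2, 3)
rstride = int(numpy.ceil(3 / float(2)))   # logical rows 3, physical rows 2
cstride = int(numpy.ceil(5 / float(3)))   # logical cols 5, physical cols 3
buf = numpy.zeros((3, 5))
buf[::rstride, ::cstride] = img           # zeros stay in the skipped spots

assert buf[0, 0] == 1 and buf[0, 2] == 2 and buf[2, 4] == 6
assert buf.sum() == img.sum()             # only zeros were inserted
```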
@@ -675,39 +732,39 @@ class ConvOp(Op):
        for b in xrange(bsize):
            for n in xrange(nkern):
                zz[b, n, ...].fill(0)
                for im0 in xrange(stacklen):
                    zz[b, n, ...] += _convolve2d(img2d[b, im0, ...],
                                                 filtersflipped[n, im0, ...],
                                                 1, val, bval, 0)
        if False:
            if False and self.out_mode == "full":
                img2d2 = numpy.zeros((bsize, stacklen,
                                      imshp[1] + 2 * kshp[0] - 2,
                                      imshp[2] + 2 * kshp[1] - 2))
                img2d2[:, :, kshp[0] - 1:kshp[0] - 1 + imshp[1],
                       kshp[1] - 1:kshp[1] - 1 + imshp[2]] = img2d
                img2d = img2d2

            # N_image_shape = image_data.shape
            for b in xrange(bsize):
                for n in xrange(nkern):
                    zz[b, n, ...].fill(0)
                    for im0 in xrange(stacklen):
                        for row in xrange(0, zz.shape[2], self.dx):
                            for col in xrange(0, zz.shape[3], self.dy):
                                zz[b, n, row, col] += (
                                    img2d[b, im0, row:row + kshp[0],
                                          col:col + kshp[1]] *
                                    filtersflipped[n, im0, ::-1, ::-1]).sum()
        # We copy it to remove the stride mismatch warning from DEBUG_MODE.
        # The copy makes us return an object with the same strides as the C
        # version. In our experience the copy does not affect performance,
        # because in that case we execute the C version, which is much faster.
        if self.dx > 1 or self.dy > 1:
            zz = zz[:, :, 0::self.dx, 0::self.dy].copy()

        z[0] = zz
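The inner loop above is a correlation of the image with the already-flipped filters, i.e. a true 2D convolution. A minimal numpy reference of one 'valid' 2D convolution, independent of Theano and scipy (shapes here are made up for illustration):

```python
import numpy

def conv2d_valid(img, kern):
    """Reference 2D 'valid' convolution: flip the kernel, then slide it
    over every fully-overlapping position and take the dot product."""
    kr, kc = kern.shape
    flipped = kern[::-1, ::-1]
    out = numpy.zeros((img.shape[0] - kr + 1, img.shape[1] - kc + 1))
    for r in range(out.shape[0]):
        for c in range(out.shape[1]):
            out[r, c] = (img[r:r + kr, c:c + kc] * flipped).sum()
    return out

img = numpy.arange(16, dtype='float64').reshape(4, 4)
kern = numpy.array([[0., 1.], [2., 3.]])
ref = conv2d_valid(img, kern)
assert ref.shape == (3, 3)

# With a kernel holding a single 1, convolution reduces to indexing the
# image: the flipped one-hot picks out one entry of each patch.
delta = numpy.zeros((2, 2))
delta[1, 1] = 1.0
assert numpy.allclose(conv2d_valid(img, delta), img[:3, :3])
```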
    def grad(self, inp, grads):
        inputs, kerns = inp
@@ -724,34 +781,38 @@ class ConvOp(Op):
            # build a "node" that should be equivalent to the one given by
            # self.make_node, but using conv3D instead of self.
            tmp_node = theano.tensor.nnet.conv3D(
                V=inputs.dimshuffle(0, 2, 3, 'x', 1),
                W=kerns[:, :, ::-1, ::-1].dimshuffle(0, 2, 3, 'x', 1),
                b=theano.tensor.alloc(numpy.asarray(0, dtype=kerns.dtype),
                                      kerns.shape[0]),
                d=(self.dx, self.dy, 1))
            node = theano.tensor.addbroadcast(
                tmp_node, 3).dimshuffle(0, 4, 1, 2)

            # mimic what happens inside theano.grad: get the input gradient
            # of the final cost wrt all variables involved.
            tmp_gmap = theano.gradient.grad_sources_inputs(
                [(node, gz)], [inputs, kerns])

            return [tmp_gmap[inputs], tmp_gmap[kerns]]
        if self.dx not in (1, 2) or self.dy not in (1, 2):
            raise NotImplementedError(
                "ERROR: We disable ConvOp.grad now when dx or "
                "dy are different from 1 and 2, as there is a bug in it.")

        all_shape = (self.imshp is not None and self.kshp is not None and
                     self.nkern is not None and self.bsize is not None)

        if not all_shape and (self.dx != 1 or self.dy != 1):
            raise Exception("ConvOp.grad: when dx!=1 or dy!=1 we must have "
                            "all the optional shape information")
        ####### Determine gradient on kernels ########
        assert inputs.ndim == 4 and kerns.ndim == 4

        newin = inputs.dimshuffle((1, 0, 2, 3))
        newgz = gz.dimshuffle((1, 0, 2, 3))

        (bsize, nkern) = None, None
        imshp = None
@@ -762,48 +823,55 @@ class ConvOp(Op):
        if self.out_mode == 'valid':
            (img, filters) = (newin, newgz)
            kshp_logical = self.fulloutshp
            kshp_logical_top_aligned = False
            if all_shape:
                (bsize, nkern) = (self.imshp[0], self.nkern)
                imshp = (self.bsize, self.imshp[1], self.imshp[2])
                kshp = self.outshp
            un_b = self.unroll_batch
            un_k = self.unroll_kern
        elif self.out_mode == 'full':
            (img, filters) = (newgz, newin)
            kshp_logical = None
            kshp_logical_top_aligned = True
            if all_shape:
                imshp_logical = (self.bsize,
                                 self.fulloutshp[0],
                                 self.fulloutshp[1])
                (bsize, nkern) = (self.nkern, self.imshp[0])
                imshp = (self.bsize, self.outshp[0], self.outshp[1])
                kshp = self.imshp[1:]
            un_b = self.unroll_kern
            un_k = self.unroll_batch
        else:
            raise NotImplementedError(
                'Only [full,valid] modes are currently supported.')
        filters = filters[:, :, ::-1, ::-1]  # flip them

        if 0:  # find good value for the unroll
            if all_shape and un_b != 0 and bsize % un_b != 0:
                if bsize < un_b:
                    un_b = bsize
                else:
                    un_b = 1
                _logger.warn(
                    "Optimization Warning: in ConvOp.grad() we can't"
                    " determine a good unroll value for the batch."
                    " Maybe you can optimize this!")
            if all_shape and un_k != 0 and nkern % un_k != 0:
                if nkern < un_k:
                    un_k = nkern
                else:
                    un_k = 1
                _logger.warn(
                    "Optimization Warning: in ConvOp.grad() we can't"
                    " determine a good unroll value for the kernel. Maybe"
                    " you can optimize this!")
            dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
                        unroll_batch=un_b, unroll_kern=un_k,
                        unroll_patch=un_p,
                        imshp_logical=imshp_logical,
                        kshp_logical=kshp_logical,
@@ -811,8 +879,8 @@ class ConvOp(Op):
                        version=self.version,
                        verbose=self.verbose)
        else:  # let __init__ choose the c params automatically from the shapes
            dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
                        unroll_batch=None, unroll_kern=None,
                        unroll_patch=None,
                        imshp_logical=imshp_logical,
                        kshp_logical=kshp_logical,
@@ -820,26 +888,25 @@ class ConvOp(Op):
                        version=self.version,
                        verbose=self.verbose)
        if hasattr(self, 'flops'):
            dw.set_flops()
        dw = dw(img, filters)

        if all_shape:
            assert (dw.owner.op.outshp == self.kshp).all()
        if self.out_mode == 'valid':
            # before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1]
            dw = dw.dimshuffle((1, 0, 2, 3))
            dw = dw[:, :, ::-1, ::-1]
        ####### Determine gradient on inputs ########
        mode = 'valid'
        if not self.out_mode == 'full':
            mode = 'full'

        filters = kerns.dimshuffle((1, 0, 2, 3))
        filters = filters[:, :, ::-1, ::-1]

        nkern = None
        imshp = None
        imshp_logical = None
@@ -848,33 +915,36 @@ class ConvOp(Op):
        if all_shape:
            nkern = self.imshp[0]
            imshp = (self.nkern, self.outshp[0], self.outshp[1])
            imshp_logical = (self.nkern, self.fulloutshp[0],
                             self.fulloutshp[1])
        if 0:  # hard-code c generation parameters
            din = ConvOp(imshp, self.kshp, nkern, self.bsize,
                         1, 1, output_mode=mode,
                         unroll_batch=un_b, unroll_kern=un_k,
                         unroll_patch=un_p,
                         imshp_logical=imshp_logical,
                         kshp_logical=None,
                         # when we change the mode, we don't forward the version
                         version=-1,
                         verbose=self.verbose)
        else:  # let __init__ figure out the unrolling / patch sizes
            din = ConvOp(imshp, self.kshp, nkern, self.bsize,
                         1, 1, output_mode=mode,
                         unroll_batch=None, unroll_kern=None,
                         unroll_patch=None,
                         imshp_logical=imshp_logical,
                         kshp_logical=None,
                         # when we change the mode, we don't forward the version
                         version=-1,
                         verbose=self.verbose)
        if hasattr(self, 'flops'):
            din.set_flops()
        din = din(gz, filters)

        assert (din.owner.op.outshp is None and self.imshp is None) or \
               (din.owner.op.outshp is None) or \
               (din.owner.op.outshp == self.imshp[1:]).all()

        # din and dw should have the same broadcasting pattern as the
        # parameters they are the gradient of (resp. inputs and kerns).
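The recipe used for `dw` above (the kernel gradient of a 'valid' convolution is itself a 'valid' convolution of the inputs with the flipped output gradient, flipped afterwards) can be checked against finite differences in plain numpy, independent of Theano; the shapes are made up for illustration:

```python
import numpy

def conv2d_valid(img, kern):
    # true 2D convolution, 'valid' mode (the kernel is flipped)
    kr, kc = kern.shape
    flipped = kern[::-1, ::-1]
    out = numpy.zeros((img.shape[0] - kr + 1, img.shape[1] - kc + 1))
    for r in range(out.shape[0]):
        for c in range(out.shape[1]):
            out[r, c] = (img[r:r + kr, c:c + kc] * flipped).sum()
    return out

rng = numpy.random.RandomState(0)
img = rng.rand(5, 5)
W = rng.rand(3, 3)
gz = rng.rand(3, 3)  # same shape as the 'valid' output

# Analytic kernel gradient: a valid convolution of the image with the
# flipped output gradient, flipped afterwards.
dW = conv2d_valid(img, gz[::-1, ::-1])[::-1, ::-1]

# Finite-difference check of d(sum(gz * z))/dW.
eps = 1e-6
num = numpy.zeros_like(W)
for i in range(3):
    for j in range(3):
        Wp = W.copy()
        Wp[i, j] += eps
        Wm = W.copy()
        Wm[i, j] -= eps
        num[i, j] = ((gz * conv2d_valid(img, Wp)).sum() -
                     (gz * conv2d_valid(img, Wm)).sum()) / (2 * eps)
assert numpy.allclose(dW, num, atol=1e-5)
```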
@@ -902,10 +972,14 @@ using namespace std;
        """ Return True if we will generate code that uses gemm.
        """
        # the gemm version only supports that case
        if self.out_mode == 'valid' and self.dx == 0 and self.dy == 0:
            # We use a faster version in that case.
            if (self.imshp != self.imshp_logical or
                    self.kshp != self.kshp_logical or
                    self.unroll_patch or
                    self.unroll_batch > 0 or
                    self.unroll_kern > 0):
                return False
            return True
        return False
@@ -918,7 +992,9 @@ using namespace std;
    def c_no_compile_args(self):
        # when kshp == (1, 1), gcc 4.3.0 segfaults during the
        # compilation with -O3. This doesn't happen at -O2.
        if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
                self.kshp == (1, 1)):
            return ['-O3']
        else:
            return []
@@ -928,10 +1004,11 @@ using namespace std;
        if self.use_blas():
            ret = blas.ldflags(libs=False, flags=True)
        if (theano.gof.cmodule.gcc_version() in ['4.3.0'] and
                self.kshp == (1, 1)):
            ret += ['-O2']
        ret += super(ConvOp, self).c_compile_args()
        return ret
@@ -951,123 +1028,140 @@ using namespace std;
        if node.inputs[0].type.dtype != node.inputs[1].type.dtype:
            raise NotImplementedError()
        assert node.inputs[0].type.dtype == node.inputs[1].type.dtype
        d = locals()
        d.update(sub)

        all_shape = (self.imshp is not None and self.kshp is not None and
                     self.nkern is not None and self.bsize is not None)

        d["self_out_mode"] = self.out_mode
        d["self_dx"] = self.dx
        d["self_dy"] = self.dy
        d["mode"] = self.out_mode.upper()
        d["affectation"] = "="
        if all_shape:
            d["self_bsize"] = self.bsize
            d["self_nkern"] = self.nkern
            d["self_outshp0"] = self.outshp[0]
            d["self_outshp1"] = self.outshp[1]
            d["self_imshp0"] = self.imshp[0]
            d["self_imshp1"] = self.imshp[1]
            d["self_imshp2"] = self.imshp[2]
            d["self_kshp0"] = self.kshp[0]
            d["self_kshp1"] = self.kshp[1]
            d["self_kshp_logical_r"] = self.kshp_logical[0]
            d["self_kshp_logical_c"] = self.kshp_logical[1]
            d["self_kshp_logical_stride_r"] = int(numpy.ceil(
                self.kshp_logical[0] / float(self.kshp[0])))
            d["self_kshp_logical_stride_c"] = int(numpy.ceil(
                self.kshp_logical[1] / float(self.kshp[1])))
            d["self_imshp_logical_r"] = self.imshp_logical[1]  # N.B. 1 not 0
            d["self_imshp_logical_c"] = self.imshp_logical[2]  # N.B. 2 not 1
            d["self_imshp_logical_stride_r"] = int(numpy.ceil(
                self.imshp_logical[1] / float(self.imshp[1])))
            d["self_imshp_logical_stride_c"] = int(numpy.ceil(
                self.imshp_logical[2] / float(self.imshp[2])))
            if not self.imshp[0] == 1:
                d["affectation"] = "+="
            d["all_shape"] = "1"
            d["dim_zz_const"] = "const"
            d["dim_zz_affect"] = ""
            d["assert_size"] = """
// Check the batch size and the number of kernels (sometimes constant in the graph)
if(img2d_dim[0] != %(self_bsize)s){
    PyErr_Format(PyExc_ValueError,
      "the batch size in the image (%%ld) at run time is different"
      " than at build time (%%ld) for the ConvOp.",
      (long)img2d_dim[0], (long)%(self_bsize)s);
    %(fail)s;
}
if(kerns_dim[0] != %(self_nkern)s){
    PyErr_Format(PyExc_ValueError,
      "the number of kernels in the filter (%%ld) at run time is"
      " different than at build time (%%ld) for the ConvOp.",
      (long)kerns_dim[0], (long)%(self_nkern)s);
    %(fail)s;
}

// Check the size of the image (sometimes constant in the graph)
if(img2d_dim[1] != %(self_imshp0)s){
    PyErr_Format(PyExc_ValueError,
      "the image stack size (%%ld) at run time is different than"
      " at build time (%%ld) for the ConvOp.",
      (long)img2d_dim[1], (long)%(self_imshp0)s);
    %(fail)s;
}
if(img2d_dim[2] != %(self_imshp1)s){
    PyErr_Format(PyExc_ValueError,
      "the number of rows in the image (%%ld) at run time is different"
      " than at build time (%%ld) for the ConvOp.",
      (long)img2d_dim[2], (long)%(self_imshp1)s);
    %(fail)s;
}
if(img2d_dim[3] != %(self_imshp2)s){
    PyErr_Format(PyExc_ValueError,
      "the number of columns in the image (%%ld) at run time is"
      " different than at build time (%%ld) for the ConvOp.",
      (long)img2d_dim[3], (long)%(self_imshp2)s);
    %(fail)s;
}

// Check the size of the output (sometimes constant in the graph)
if(dim_zz[0] != %(self_outshp0)s){
    PyErr_Format(PyExc_ValueError,
      "the precomputed number of rows in the output (%%ld) at run time"
      " is different than at build time (%%ld) for the ConvOp.",
      (long)dim_zz[0], (long)%(self_outshp0)s);
    %(fail)s;
}
if(dim_zz[1] != %(self_outshp1)s){
    PyErr_Format(PyExc_ValueError,
      "the precomputed number of columns in the output (%%ld) at run"
      " time is different than at build time (%%ld) for the ConvOp.",
      (long)dim_zz[1], (long)%(self_outshp1)s);
    %(fail)s;
}

// Check the size of the filter (sometimes constant in the graph)
if(kerns_dim[1] %% %(self_imshp0)s!=0){
    PyErr_Format(PyExc_ValueError,
      "the filter stack size (%%ld) at run time is different than at"
      " build time (%%ld) for the ConvOp.",
      (long)kerns_dim[1], (long)%(self_imshp0)s);
    %(fail)s;
}
if(kerns_dim[2] %% %(self_kshp0)s!=0){
    PyErr_Format(PyExc_ValueError,
      "the number of rows in the filter (%%ld) at run time is different"
      " than at build time (%%ld) for the ConvOp.",
      (long)kerns_dim[2], (long)%(self_kshp0)s);
    %(fail)s;
}
if(kerns_dim[3] %% %(self_kshp1)s!=0){
    PyErr_Format(PyExc_ValueError,
      "the number of columns in the filter (%%ld) at run time is"
      " different than at build time (%%ld) for the ConvOp.",
      (long)kerns_dim[3], (long)%(self_kshp1)s);
    %(fail)s;
}
""" % (locals())
        else:
            d["self_bsize"] = "%(img2d)s->dimensions[0]" % d
            d["self_nkern"] = "%(filtersflipped)s->dimensions[0]" % d
            d["self_outshp0"] = "-1"
            d["self_outshp1"] = "-1"
            d["self_imshp0"] = "%(img2d)s->dimensions[1]" % d
            d["self_imshp1"] = "%(img2d)s->dimensions[2]" % d
            d["self_imshp2"] = "%(img2d)s->dimensions[3]" % d
            d["self_kshp0"] = "%(filtersflipped)s->dimensions[2]" % d
            d["self_kshp1"] = "%(filtersflipped)s->dimensions[3]" % d
            d["affectation"] = "+="
            d["all_shape"] = "0"
            d["dim_zz_const"] = ""
d["dim_zz_affect"]=""" d["dim_zz_affect"] = """
if (mode == FULL) { if (mode == FULL) {
dim_zz[0] = (int)ceil((dim_im[0]+dim_ker0-1)/float(%(self_dx)s)); dim_zz[0] = (int)ceil((dim_im[0]+dim_ker0-1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]+dim_ker1-1)/float(%(self_dy)s)); dim_zz[1] = (int)ceil((dim_im[1]+dim_ker1-1)/float(%(self_dy)s));
...@@ -1075,8 +1169,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){ ...@@ -1075,8 +1169,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
dim_zz[0] = (int)ceil((dim_im[0]-dim_ker0+1)/float(%(self_dx)s)); dim_zz[0] = (int)ceil((dim_im[0]-dim_ker0+1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]-dim_ker1+1)/float(%(self_dy)s)); dim_zz[1] = (int)ceil((dim_im[1]-dim_ker1+1)/float(%(self_dy)s));
} }
"""% d """ % d
d["assert_size"]="" d["assert_size"] = ""
if self.kshp_logical_top_aligned: if self.kshp_logical_top_aligned:
d["self_kshp_logical_offset_r"] = 0 d["self_kshp_logical_offset_r"] = 0
...@@ -1084,36 +1178,47 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){ ...@@ -1084,36 +1178,47 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){
elif all_shape: elif all_shape:
rstride = d["self_kshp_logical_stride_r"] rstride = d["self_kshp_logical_stride_r"]
cstride = d["self_kshp_logical_stride_c"] cstride = d["self_kshp_logical_stride_c"]
d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] - (self.kshp[0]*rstride) - 1+rstride) % rstride d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] -
d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] - (self.kshp[1]*cstride) - 1+cstride) % cstride (self.kshp[0] * rstride) -
1 + rstride) % rstride
d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] -
(self.kshp[1] * cstride) -
1 + cstride) % cstride
del rstride, cstride del rstride, cstride
if node.inputs[0].type.dtype=="float32": d["type"]="float" if node.inputs[0].type.dtype == "float32":
elif node.inputs[0].type.dtype=="float64": d["type"]="double" d["type"] = "float"
else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype) elif node.inputs[0].type.dtype == "float64":
d["gemm"]='dgemm_' d["type"] = "double"
if not d["type"]=="double":d["gemm"]='sgemm_' else:
raise Exception("Type %s not implemented" %
node.inputs[0].type.dtype)
d["gemm"] = 'dgemm_'
if not d["type"] == "double":
d["gemm"] = 'sgemm_'
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical: if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
if self.verbose: if self.verbose:
_logger.debug("return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version") _logger.debug("return imshp!=imshp_logical or"
" self.kshp != self.kshp_logical shape version")
return _conv_op_code_a % d return _conv_op_code_a % d
if self.unroll_patch: if self.unroll_patch:
if self.verbose: if self.verbose:
_logger.debug("return unroll patch version. all_shape=%s", all_shape) _logger.debug("return unroll patch version. all_shape=%s",
return _conv_op_code_unroll_patch%d all_shape)
if self.unroll_batch>0 or self.unroll_kern>0: return _conv_op_code_unroll_patch % d
assert self.unroll_batch>0 if self.unroll_batch > 0 or self.unroll_kern > 0:
assert self.unroll_kern>0 assert self.unroll_batch > 0
assert self.unroll_kern > 0
if self.verbose: if self.verbose:
_logger.debug("return unrolled batch (%s) and kern code (%s)", _logger.debug("return unrolled batch (%s) and kern code (%s)",
str(self.unroll_batch), str(self.unroll_kern)) str(self.unroll_batch), str(self.unroll_kern))
return gen_conv_code_unroll_batch_kern(d, self.unroll_batch, return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
self.unroll_kern) self.unroll_kern)
#TODO: should we choose the unroll size automatically with the bigger divisor under 5? #TODO: should we choose the unroll size automatically with the bigger divisor under 5?
if self.out_mode == 'valid' and self.dx==0 and self.dy==0: if self.out_mode == 'valid' and self.dx == 0 and self.dy == 0:
if self.verbose: if self.verbose:
_logger.debug("return gemm version") _logger.debug("return gemm version")
return _conv_op_code_valid_gemm % d return _conv_op_code_valid_gemm % d
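The TODO above suggests picking the unroll size as the biggest divisor of the loop extent that stays under 5. As a hedged sketch of that idea (a hypothetical helper, not part of Theano), the selection could look like:

```python
# Hypothetical helper for the TODO above: return the largest divisor of
# n that is <= cap, falling back to 1 (no unrolling) for prime sizes.
def auto_unroll(n, cap=5):
    for candidate in range(min(n, cap), 0, -1):
        if n % candidate == 0:
            return candidate
    return 1

# A batch of 12 images could be unrolled 4 at a time; a prime batch
# size of 7 falls back to no unrolling.
print(auto_unroll(12))  # -> 4
print(auto_unroll(7))   # -> 1
```

A divisor is required so the unrolled loop never overruns the batch or kernel count without extra cleanup code.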
@@ -1126,7 +1231,8 @@ if(kerns_dim[3] %% %(self_kshp1)s!=0){

_conv_op_code_a = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL;
PyArrayObject *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;

int type_im=PyArray_TYPE(%(img2d)s);
@@ -1216,12 +1322,17 @@ if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
filtersflipped_arr = (PyArrayObject*)filtersflipped;

if(mode != VALID && mode != FULL){
    PyErr_SetString(PyExc_ValueError,
                    "invalid mode, only full and valid are supported");
    %(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {
    PyErr_SetString(PyExc_ValueError, "Input types must match");
    %(fail)s;
}

if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
@@ -1249,10 +1360,19 @@ Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;

//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *
                         %(z)s->dimensions[2] *
                         %(z)s->dimensions[3] *
                         (npy_intp)sizeof(%(type)s))
    %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] *
                         %(z)s->dimensions[3] *
                         (npy_intp)sizeof(%(type)s))
    %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s))
    %(fail)s;
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s))
    %(fail)s;

for(int b=0;b< %(self_bsize)s;b++){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
@@ -1267,34 +1387,41 @@ for(int b=0;b< %(self_bsize)s;b++){
    for (int iter_m=0; iter_m < Os[0]; iter_m++) {
      // Reposition index into input image based on requested output size
      //row position in logical output image
      int pos_m = iter_m*%(self_dx)s;
      //row anchor in logical input image (we will loop upward from here)
      int new_m;
      if (mode == FULL) new_m = pos_m ;
      else new_m = (pos_m+dim_ker_log[0]-1);

      for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns
        // current col position in logical output image
        int pos_n=iter_n*%(self_dy)s;
        %(type)s sum=0;

        // Sum over kernel, if index into image is out of bounds
        // fill with the value
        // loop over logical rows in kernel
        for (int j_log=0; j_log < %(self_kshp_logical_r)s; j_log++) {
          // ind0_log: row position in logical input image
          int ind0_log = (new_m-j_log);

          if ((j_log < %(self_kshp_logical_offset_r)s) ||
              (j_log - %(self_kshp_logical_offset_r)s) MOD %(self_kshp_logical_stride_r)s)
            continue;

          if (ind0_log MOD %(self_imshp_logical_stride_r)s)
            continue;

          int j_phys = ((j_log- %(self_kshp_logical_offset_r)s) /
                        %(self_kshp_logical_stride_r)s);
          int ind0_phys = (ind0_log / %(self_imshp_logical_stride_r)s);
          //std::cerr <<"j_log" << j_log << " j_phys " << j_phys << " " << ind0_phys << "\\n";

          if(mode==FULL){
            //This is a pointer to the current row of the kernel
            const %(type)s * idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
            if(ind0_log < 0 || ind0_log >= dim_im_log[0]){
              // the current row of the kernel is off the image
            }else{
@@ -1304,30 +1431,40 @@ for(int b=0;b< %(self_bsize)s;b++){
              for (int ind1_log=pos_n-k; k<max_k; k++,ind1_log--) {
                if (1)
                {
                  if ((k < %(self_kshp_logical_offset_c)s) ||
                      (k - %(self_kshp_logical_offset_c)s) MOD
                      %(self_kshp_logical_stride_c)s)
                    continue;
                  if (ind1_log MOD
                      %(self_imshp_logical_stride_c)s)
                    continue;
                }
                sum += idx_hvals[(k-%(self_kshp_logical_offset_c)s) /
                                 %(self_kshp_logical_stride_c)s] *
                       idx_in[ind1_log / %(self_imshp_logical_stride_c)s];
              }
            }
          }else{  // mode==VALID
            //JB: should be dim_im[1] right? (was dim_im[0])
            const %(type)s* idx_in=&in[ind0_phys*dim_im_phys[1]];
            const %(type)s* idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
            int new_n = (pos_n+dim_ker_log[1]-1);

            if (%(self_imshp_logical_stride_c)s != 1)  // a general loop
            {
              for (int k=0,last=new_n; k < dim_ker_log[1]; k++,last--) {
                if ((k < %(self_kshp_logical_offset_c)s) ||
                    (k - %(self_kshp_logical_offset_c)s) MOD
                    %(self_kshp_logical_stride_c)s)
                  continue;
                else if (last MOD %(self_imshp_logical_stride_c)s)
                  continue;
                else
                {
                  sum+=idx_hvals[(k-%(self_kshp_logical_offset_c)s) /
                                 %(self_kshp_logical_stride_c)s] *
                       idx_in[last/%(self_imshp_logical_stride_c)s];
                }
              }
            }
@@ -1335,7 +1472,8 @@ for(int b=0;b< %(self_bsize)s;b++){
            {
              int offset = %(self_kshp_logical_offset_c)s;
              int k_phys=0;
              for (int k_log=offset,last=new_n-offset;
                   k_log < dim_ker_log[1]; ) {
                sum += idx_hvals[k_phys]*idx_in[last];
                ++k_phys;
                last -= %(self_kshp_logical_stride_c)s;
@@ -1343,10 +1481,10 @@ for(int b=0;b< %(self_bsize)s;b++){
              }
            }
          }
        }//for j_log
        out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
      }//for iter_n
    }//for iter_m
  }//for stack_size
  if (0 && (mode==FULL)){
    for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i)
@@ -1585,33 +1723,36 @@ free(kbuf);
Py_XDECREF(img2d);
"""
def gen_conv_code_unroll_batch_kern(d, unroll_bsize=1, unroll_ksize=1):
    """c_code for ConvOp that unrolls the batch size loop.
    """
    assert unroll_bsize > 0 and unroll_ksize > 0
    if ("unroll_bsize" in d or "unroll_ksize" in d or "unroll_iter" in d or
            "unroll_biter" in d or "unroll_kiter" in d):
        raise Exception("We can't use this dictionary as we would"
                        " overwrite some of its content")
    d = d.copy()

    d["unroll_bsize"] = unroll_bsize
    d["unroll_ksize"] = unroll_ksize

    def my_dup(st, size):
        s = ""
        for i in xrange(size):
            d["unroll_iter"] = i
            s += st % d
        return s + "\n"

    def my_dup2(st):
        s = ""
        iter = 0
        for i in xrange(unroll_bsize):
            d["unroll_biter"] = i
            for j in xrange(unroll_ksize):
                d["unroll_kiter"] = j
                d["unroll_iter"] = iter
                iter += 1
                s += st % d
        return s + "\n"
    ret = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
@@ -1765,7 +1906,8 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns
    int pos_n=iter_n*%(self_dy)s;
""" % d
    ret += my_dup(
        "%(type)s sum%(unroll_iter)s=0;", unroll_bsize * unroll_ksize)
    ret += """
// Sum over kernel, if index into image is out of bounds
...
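The `my_dup` helper above generates unrolled C source by instantiating a `%`-style template once per iteration, so each unrolled iteration gets its own C variable. A standalone sketch of that trick (not the Theano code itself):

```python
# Minimal sketch of the template-duplication trick used by my_dup: a
# printf-style template is filled in once per unrolled iteration, and
# the concatenation becomes the generated C source.
def dup(template, d, size):
    s = ""
    for i in range(size):
        s += template % dict(d, unroll_iter=i)
    return s

code = dup("%(type)s sum%(unroll_iter)s=0;\n", {"type": "float"}, 3)
print(code)
# float sum0=0;
# float sum1=0;
# float sum2=0;
```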
@@ -27,7 +27,7 @@ from theano.tensor.nnet import (categorical_crossentropy,
                                softmax_with_bias, SoftmaxGrad,
                                Prepend_scalar_constant_to_each_row,
                                Prepend_scalar_to_each_row)
from theano.tensor import matrix, vector, lvector, scalar


class T_sigmoid(unittest.TestCase):
@@ -71,8 +71,8 @@ class T_Softmax(utt.InferShapeTester):
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_infer_shape(self):
        admat = matrix()
        admat_val = numpy.random.rand(3, 4).astype(config.floatX)
        self._compile_and_check([admat], [Softmax()(admat)],
                                [admat_val], Softmax)
@@ -136,10 +136,10 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
        #print f.maker.fgraph.toposort()

    def test_infer_shape(self):
        admat = matrix()
        advec = vector()
        admat_val = numpy.random.rand(3, 4).astype(config.floatX)
        advec_val = numpy.random.rand(4).astype(config.floatX)
        self._compile_and_check([admat, advec],
                                [SoftmaxWithBias()(admat, advec)],
                                [admat_val, advec_val], SoftmaxWithBias)
@@ -148,10 +148,10 @@ class T_SoftmaxWithBias(utt.InferShapeTester):

class T_SoftmaxGrad(utt.InferShapeTester):
    def test_infer_shape(self):
        admat = matrix()
        bdmat = matrix()
        admat_val = numpy.random.rand(3, 4).astype(config.floatX)
        bdmat_val = numpy.random.rand(3, 4).astype(config.floatX)
        self._compile_and_check([admat, bdmat], [SoftmaxGrad()(admat, bdmat)],
                                [admat_val, bdmat_val], SoftmaxGrad)
@@ -218,13 +218,13 @@ class T_CrossentropySoftmax1HotWithBiasDx(utt.InferShapeTester):
        utt.verify_grad(f, [rng.rand(10)])

    def test_infer_shape(self):
        admat = matrix()
        advec = vector()
        alvec = lvector()
        rng = numpy.random.RandomState(utt.fetch_seed())
        admat_val = rng.rand(10, 5).astype(config.floatX)
        admat_val /= admat_val.sum(axis=1).reshape(10, 1)
        advec_val = rng.rand(10).astype(config.floatX)
        alvec_val = rng.randint(low=0, high=5, size=10)
        self._compile_and_check([advec, admat, alvec],
            [CrossentropySoftmax1HotWithBiasDx()(advec, admat, alvec)],
@@ -258,12 +258,12 @@ class T_CrossentropySoftmaxArgmax1HotWithBias(utt.InferShapeTester):
                        numpy.random.rand(n_classes)])

    def test_infer_shape(self):
        admat = matrix()
        advec = vector()
        alvec = lvector()
        rng = numpy.random.RandomState(utt.fetch_seed())
        admat_val = rng.rand(3, 5).astype(config.floatX)
        advec_val = rng.rand(5).astype(config.floatX)
        alvec_val = rng.randint(low=0, high=5, size=3)
        self._compile_and_check([admat, advec, alvec],
            CrossentropySoftmaxArgmax1HotWithBias()(admat, advec, alvec),
@@ -293,11 +293,11 @@ class T_prepend(utt.InferShapeTester):
        self.assertTrue(numpy.all(my[:, 0] == 5.0))

    def test_infer_shape(self):
        admat = matrix()
        adscal = scalar()
        rng = numpy.random.RandomState(utt.fetch_seed())
        admat_val = rng.rand(3, 5).astype(config.floatX)
        adscal_val = numpy.asarray(rng.rand(), dtype=config.floatX).item()
        self._compile_and_check([admat],
            [Prepend_scalar_constant_to_each_row(adscal_val)(admat)],
            [admat_val],
@@ -312,12 +312,12 @@ class T_prepend(utt.InferShapeTester):

class T_CrossentropyCategorical1HotGrad(utt.InferShapeTester):
    def test_infer_shape(self):
        advec = vector()
        admat = matrix()
        alvec = lvector()
        rng = numpy.random.RandomState(utt.fetch_seed())
        advec_val = rng.rand(3).astype(config.floatX)
        admat_val = rng.rand(3, 2).astype(config.floatX)
        alvec_val = [0, 1, 0]
        self._compile_and_check([advec, admat, alvec],
            [CrossentropyCategorical1HotGrad()(advec, admat, alvec)],
@@ -345,10 +345,10 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
    # see issue gh-788
    def est_infer_shape(self):
        admat = matrix()
        alvec = lvector()
        rng = numpy.random.RandomState(utt.fetch_seed())
        admat_val = rng.rand(3, 2).astype(config.floatX)
        alvec_val = [0, 1, 0]
        self._compile_and_check([admat, alvec],
            [CrossentropyCategorical1Hot()(admat, alvec)],
@@ -570,11 +570,11 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
        if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
            mode = 'FAST_RUN'
        rng = numpy.random.RandomState(utt.fetch_seed())
        x_val = rng.randn(3, 5).astype(config.floatX)
        b_val = rng.randn(5).astype(config.floatX)
        y_val = numpy.asarray([2, 4, 1])
        x = T.matrix('x')
        b = T.vector('b')
        y = T.lvector('y')

        ## Basic case
@@ -696,11 +696,11 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
        if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
            mode = 'FAST_RUN'
        rng = numpy.random.RandomState(utt.fetch_seed())
        x_val = rng.randn(3, 5).astype(config.floatX)
        b_val = rng.randn(5).astype(config.floatX)
        y_val = numpy.asarray([2, 4, 1], dtype='int64')
        x = T.matrix('x')
        b = T.vector('b')
        y = T.lvector('y')
        yi = T.cast(y, 'int32')
        expressions = [
@@ -739,10 +739,10 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
        if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
            mode = 'FAST_RUN'
        rng = numpy.random.RandomState(utt.fetch_seed())
        x_val = rng.randn(5).astype(config.floatX)
        y_val = numpy.asarray([2])
        x = T.vector('x')
        y = T.lvector('y')

        def print_graph(func):
@@ -788,12 +788,12 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
        if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
            mode = 'FAST_RUN'
        rng = numpy.random.RandomState(utt.fetch_seed())
        x_val = rng.randn(5).astype(config.floatX)
        b_val = rng.randn(5).astype(config.floatX)
        y_val = numpy.asarray([2])
        x = T.vector('x')
        b = T.vector('b')
        y = T.lvector('y')

        def print_graph(func):
@@ -850,13 +850,13 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
        if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
            mode = 'FAST_RUN'
        rng = numpy.random.RandomState(utt.fetch_seed())
        x_val = rng.randn(3, 5).astype(config.floatX)
        b_val = rng.randn(5).astype(config.floatX)
        y_val = numpy.asarray([2, 4, 1])
        x = T.matrix('x')
        b = T.vector('b')
        y = T.lvector('y')
        a = T.scalar('a')

        def print_graph(func):
            for i, node in enumerate(func.maker.fgraph.toposort()):
@@ -951,7 +951,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):

def test_argmax_pushdown():
    x = tensor.matrix()

    #test that the max_and_argmax is pushed down if the max is not used
    out = tensor.max_and_argmax(
@@ -969,7 +969,7 @@ def test_argmax_pushdown():
    assert len(fgraph.toposort()) == 2  # an output_guard is second
    assert fgraph.toposort()[0].op == tensor.basic._max_and_argmax
    assert str(fgraph.toposort()[1].op) == 'OutputGuard'

    x = tensor.matrix()
    #test that the max_and_argmax is not pushed down if the max is used
    out = tensor.max_and_argmax(
        softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
@@ -998,8 +998,8 @@ def test_argmax_pushdown():

def test_argmax_pushdown_bias():
    x = tensor.matrix()
    b = tensor.vector()

    out = tensor.argmax(softmax_with_bias(x, b), axis=-1)
    fgraph = gof.FunctionGraph(
@@ -1018,8 +1018,8 @@ def test_argmax_pushdown_bias():
    assert isinstance(fgraph.toposort()[2].op, tensor.MaxAndArgmax)
    assert str(fgraph.toposort()[3].op) == 'OutputGuard'

    x = tensor.matrix()
    b = tensor.vector()
    out = tensor.max_and_argmax(softmax_with_bias(x, b), axis=-1)[0]
    fgraph = gof.FunctionGraph(
        [x, b],
@@ -1068,8 +1068,8 @@ def test_asymptotic_32():
        for i, n in enumerate(f.maker.fgraph.toposort()):
            print i, n

    xval = numpy.zeros((5, 5), dtype=dtype).astype(dtype)
    x2val = numpy.zeros(5, dtype=xval.dtype).astype(dtype)
    for i in xrange(100):
        cval, gxval = f(xval, numpy.arange(5), x2val)
        xval -= 100.3 * gxval
@@ -1185,7 +1185,25 @@ class Test_softmax_opt:
    # REPEAT 3 CASES in presence of log(softmax) with the advanced indexing
    # etc.


def test_stabilize_log_softmax():
    mode = theano.compile.mode.get_default_mode()
    mode = mode.including('local_log_softmax', 'specialize')

    x = matrix()
    y = theano.tensor.nnet.softmax(x)
    z = theano.tensor.log(y)

    f = theano.function([x], z, mode=mode)

    # check that the softmax has been optimized out
    for node in f.maker.fgraph.toposort():
        assert not isinstance(node.op, y.owner.op.__class__)

    # call the function so debug mode can verify the optimized
    # version matches the unoptimized version
    rng = numpy.random.RandomState([2012, 8, 22])
    f(numpy.cast[config.floatX](rng.randn(2, 3)))
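The `local_log_softmax` optimization this test exercises is a stabilization rewrite: `log(softmax(x))` computed naively overflows in `exp` for large inputs. A pure-Python sketch of the stable identity the rewrite relies on (an illustration of the math, not Theano's implementation):

```python
import math

# Stable log-softmax: log(softmax(x)) == x - max(x) - log(sum(exp(x - max(x)))).
# Subtracting the max keeps every exp() argument <= 0, so nothing overflows.
def stable_log_softmax(xs):
    m = max(xs)
    lse = m + math.log(sum(math.exp(v - m) for v in xs))
    return [v - lse for v in xs]

# math.exp(1000.0) raises OverflowError, so the naive form is unusable
# here; the stable form returns finite values whose exps sum to 1.
out = stable_log_softmax([1000.0, 1001.0, 1002.0])
print(out)
```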
if __name__ == '__main__':
    unittest.main()
@@ -963,6 +963,9 @@ class ShapeFeature(object):
        for sh_idx, sh in enumerate(o_shapes):
            if sh is None:
                continue
            if not isinstance(sh, (list, tuple)):
                raise ValueError("infer_shape of %s didn't return a list of"
                                 " lists. It returned '%s'" % (str(node), str(o_shapes)))
            for i, d in enumerate(sh):
                # Note: we ignore any shape element that is not typed (i.e.,
                # does not have a 'dtype' attribute). This means there may
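The new check enforces the `infer_shape` contract: an Op must return one shape *sequence* (a list or tuple of dimensions) per output, never a bare dimension. A hedged sketch of that validation as a hypothetical standalone helper (not the actual `ShapeFeature` method):

```python
# Hypothetical helper illustrating the infer_shape contract: o_shapes
# must hold one list/tuple (or None) per output.
def check_o_shapes(o_shapes, n_outputs):
    if len(o_shapes) != n_outputs:
        raise ValueError("infer_shape returned %d shapes for %d outputs"
                         % (len(o_shapes), n_outputs))
    for sh in o_shapes:
        if sh is not None and not isinstance(sh, (list, tuple)):
            raise ValueError("infer_shape didn't return a list of"
                             " lists. It returned '%s'" % (o_shapes,))

check_o_shapes([(3, 4)], 1)      # fine: one output, one shape tuple
try:
    check_o_shapes([3], 1)       # wrong: a bare dimension, not a sequence
except ValueError as e:
    print(e)
```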
...
...@@ -4456,6 +4456,7 @@ class T_op_cache(unittest.TestCase):
        a = rand(5,2).astype(config.floatX)
        self.assertTrue(numpy.all(fn_py(a) == fn_c_or_py(a)))


class T_reshape(unittest.TestCase):
    def setUp(self):
        utt.seed_rng()
...@@ -4469,10 +4470,10 @@ class T_reshape(unittest.TestCase):
        c = reshape(b, as_tensor_variable(6), ndim=1)
        f = inplace_func([b], c)
        b_val1 = numpy.asarray([[0, 1, 2], [3, 4, 5]])
        c_val1 = numpy.asarray([0, 1, 2, 3, 4, 5])
        b_val2 = b_val1.T
        c_val2 = numpy.asarray([0, 3, 1, 4, 2, 5])
        f_out1 = f(b_val1)
        f_out2 = f(b_val2)
...@@ -4484,78 +4485,88 @@ class T_reshape(unittest.TestCase):
        #basic to 1 dim(with list)
        c = reshape(b, (as_tensor_variable(6),), ndim=1)
        f = inplace_func([b], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
                         numpy.asarray([0, 1, 2, 3, 4, 5]))
        #print f.maker.fgraph.toposort()
        #check that we remove the useless reshape

        #basic to shape object of same ndim
        c = reshape(b, d.shape)
        f = inplace_func([b, d], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]]),
                           [[0, 1], [2, 3], [4, 5]]) ==
                         numpy.asarray([[0, 1], [2, 3], [4, 5]]))

        #basic to 2 dims
        c = reshape(a, [2, 3])
        f = inplace_func([a], c)
        assert numpy.all(f(numpy.asarray([0, 1, 2, 3, 4, 5])) ==
                         numpy.asarray([[0, 1, 2], [3, 4, 5]]))

        #test that it works without inplace operations
        a_val = numpy.asarray([0, 1, 2, 3, 4, 5])
        a_val_copy = numpy.asarray([0, 1, 2, 3, 4, 5])
        b_val = numpy.asarray([[0, 1, 2], [3, 4, 5]])
        f_sub = inplace_func([a, b], c - b)
        assert numpy.all(f_sub(a_val, b_val) == 0.0)
        assert numpy.all(a_val == a_val_copy)

        #test that it works with inplace operations
        a_val = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64')
        a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64')
        b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')
        f_sub = inplace_func([a, b], c - b)
        assert numpy.all(f_sub(a_val, b_val) == 0.0)
        assert numpy.all(a_val == a_val_copy)

        # verify gradient
        def just_vals(v):
            return Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
        utt.verify_grad(just_vals, [a_val])

        #test infer_shape
        f_sub = function([a, b], (c - b).shape)
        if config.mode == "FAST_COMPILE":
            assert len(f_sub.maker.fgraph.toposort()) == 3
        else:
            topo = f_sub.maker.fgraph.toposort()
            assert len(topo) == 1
            assert topo[0].op == theano.compile.function_module.deep_copy_op
            #assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5]]))==[2,3])#work in FAST_RUN, but fail on other!
            #assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5],[6,7]]))==[2,3])#work in FAST_RUN, but fail on other!

        # test broadcast flag for constant value of 1
        c = reshape(b, (b.shape[0], b.shape[1], 1))
        f = inplace_func([b], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
                         numpy.asarray([[[0], [1], [2]], [[3], [4], [5]]]))
        assert (f.maker.fgraph.toposort()[-2].outputs[0].type.broadcastable ==
                (False, False, True))

        assert numpy.all(f_sub(a_val, b_val) == [2, 3])
    def test_bad_shape(self):
        a = matrix('a')
        shapes = ivector('shapes')
        ndim = 2
        rng = numpy.random.RandomState(seed=utt.fetch_seed())
        a_val = rng.uniform(size=(3, 4)).astype(config.floatX)

        # Test reshape to 1 dim
        r = a.reshape(shapes, ndim=1)
        z = zeros_like(r)
        f = function([a, shapes], z.shape)
        self.assertRaises(ValueError, f, a_val, [13])

        # Test reshape to 2 dims
        r = a.reshape(shapes, ndim=2)
        z = zeros_like(r)
        f = function([a, shapes], z.shape)
        self.assertTrue((f(a_val, [4, 3]) == [4, 3]).all())
        self.assertTrue((f(a_val, [-1, 3]) == [4, 3]).all())
        self.assertTrue((f(a_val, [4, -1]) == [4, 3]).all())
        self.assertRaises(ValueError, f, a_val, [-1, 5])
        self.assertRaises(ValueError, f, a_val, [7, -1])
        self.assertRaises(ValueError, f, a_val, [7, 5])
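`test_bad_shape` relies on reshape semantics that Theano shares with NumPy: a `-1` entry means "infer this dimension", and any target shape whose element count cannot match the input raises `ValueError`. A NumPy illustration of the same cases:

```python
import numpy as np

a = np.arange(12).reshape(3, 4)  # 12 elements, like the (3, 4) a_val above

print(a.reshape([4, 3]).shape)    # (4, 3): explicit, product matches
print(a.reshape([-1, 3]).shape)   # (4, 3): first dim inferred as 12 // 3
print(a.reshape([4, -1]).shape)   # (4, 3): second dim inferred

# All of the shapes the test expects to fail also fail in NumPy:
# 13 != 12, 12 is not divisible by 5 or 7, and 7 * 5 != 12.
for bad in ([13], [-1, 5], [7, -1], [7, 5]):
    try:
        a.reshape(bad)
    except ValueError:
        print("reshape(%s) raises ValueError" % (bad,))
```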
...@@ -4577,8 +4588,8 @@ def test_flatten_outdimNone():
    a = dmatrix()
    c = flatten(a)
    f = inplace_func([a], c)
    a_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')
    c_val = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64')
    assert numpy.all(f(a_val) == c_val)
    f = inplace_func([a], c)
    assert numpy.all(f(a_val) == c_val)
...@@ -4601,8 +4612,8 @@ def test_flatten_outdim1():
    a = dmatrix()
    c = flatten(a, 1)
    f = inplace_func([a], c)
    a_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')
    c_val = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64')
    assert numpy.all(f(a_val) == c_val)
    f = inplace_func([a], c)
    assert numpy.all(f(a_val) == c_val)
...@@ -4613,7 +4624,7 @@ def test_flatten_outdim2():
    a = dmatrix()
    c = flatten(a, 2)
    f = inplace_func([a], c)
    a_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')
    assert numpy.all(f(a_val) == a_val)
    f = inplace_func([a], c)
    assert numpy.all(f(a_val) == a_val)
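The flatten tests above cover `outdim` of None/1 (fully flattened) and 2 (leading axis kept, so a matrix is unchanged). The rule is that `flatten(x, outdim)` keeps the first `outdim - 1` axes and collapses the rest. A NumPy sketch with a hypothetical helper:

```python
import numpy as np

def flatten_to(x, outdim):
    # Hypothetical helper mirroring theano.tensor.flatten(x, outdim):
    # keep the leading outdim - 1 axes, collapse the remaining ones
    # into a single trailing axis.
    return x.reshape(x.shape[:outdim - 1] + (-1,))

a_val = np.asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')

print(flatten_to(a_val, 1))  # [0. 1. 2. 3. 4. 5.] -- fully flattened, like c_val
print(flatten_to(a_val, 2))  # unchanged: the input is already 2-D
```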
...@@ -6679,8 +6690,17 @@ class TestInferShape(utt.InferShapeTester):
        # (non-constant) input shape
        admat = dmatrix()
        aivec = ivector()

        ndim = 1
        admat_val = rand(3, 4)
        self._compile_and_check([admat],
                                [Reshape(ndim)(admat, [12])],
                                [admat_val], Reshape)
        self._compile_and_check([admat],
                                [Reshape(ndim)(admat, [-1])],
                                [admat_val], Reshape)
        ndim = 2
        self._compile_and_check([admat],
                                [Reshape(ndim)(admat, [4, 3])],
                                [admat_val], Reshape)
...@@ -6689,6 +6709,17 @@ class TestInferShape(utt.InferShapeTester):
        self._compile_and_check([admat],
                                [Reshape(ndim)(admat, [4, -1])],
                                [admat_val], Reshape)
        self._compile_and_check([admat],
                                [Reshape(ndim)(admat, [3, -1])],
                                [admat_val], Reshape)
        self._compile_and_check([admat],
                                [Reshape(ndim)(admat, [-1, 3])],
                                [admat_val], Reshape)
        self._compile_and_check([admat],
                                [Reshape(ndim)(admat, [-1, 4])],
                                [admat_val], Reshape)
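The new `_compile_and_check` cases exercise shape inference through a `-1` entry, where the unknown dimension is the total element count divided by the product of the known dimensions. A standalone sketch of that computation (hypothetical helper, assumes at most one `-1`):

```python
def resolve_shape(total, shape):
    """Hypothetical helper: replace a single -1 in `shape` by the value
    that makes the product equal `total`, as Reshape's infer_shape must."""
    known = 1
    for d in shape:
        if d != -1:
            known *= d
    if -1 in shape:
        if total % known != 0:
            raise ValueError("cannot infer -1: %d is not divisible by %d"
                             % (total, known))
        return tuple(total // known if d == -1 else d for d in shape)
    return tuple(shape)

print(resolve_shape(12, [3, -1]))   # (3, 4)
print(resolve_shape(12, [-1, 4]))   # (3, 4)
print(resolve_shape(12, [-1]))      # (12,)
```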
        # enable when infer_shape is generalized:
        # self._compile_and_check([admat, aivec],
        #                         [Reshape(ndim)(admat, aivec)],
...
...@@ -132,25 +132,6 @@ class test_dimshuffle_lift(unittest.TestCase):
                          "{x,0,1}(y)), z)]"), str(g))
def test_stabilize_log_softmax():
    mode = theano.compile.mode.get_default_mode()
    mode = mode.including('local_log_softmax')
    x = matrix()
    y = theano.tensor.nnet.softmax(x)
    z = theano.tensor.log(y)
    f = function([x], z)
    # check that the softmax has been optimized out
    for node in f.maker.fgraph.toposort():
        assert not isinstance(node.op, y.owner.op.__class__)
    # call the function so debug mode can verify the optimized
    # version matches the unoptimized version
    rng = numpy.random.RandomState([2012, 8, 22])
    f(numpy.cast[config.floatX](rng.randn(2, 3)))
def test_add_canonizer_problem0():
    n_segments = 10
    label = lscalar('label')
...
...@@ -115,15 +115,9 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile):
    # Setting aside current working directory for later saving
    sav_dir = os.getcwd()
    if len(argv) == 1:
        tests_dir = theano.__path__[0]
        other_args = []
    else:
        # tests_dir should be at the end of argv, there can be other arguments
        tests_dir = argv[-1]
        other_args = argv[1:-1]
    assert os.path.isdir(tests_dir)
    os.chdir(tests_dir)
    # The first argument is the called script.
    argv = argv[1:]
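This hunk replaces the old `tests_dir`/`other_args` split with a simpler scheme: drop the script name and forward every remaining user argument to nosetests unchanged. A minimal sketch of the pattern (hypothetical function, not the actual script):

```python
def build_command(python, theano_nose, argv):
    # Hypothetical mirror of the subprocess calls below: the first
    # element of argv is the calling script, everything after it is
    # passed straight through to nosetests.
    argv = argv[1:]
    return [python, theano_nose, '--collect-only', '--with-id'] + argv

cmd = build_command('python', 'theano-nose',
                    ['run.py', 'theano/tensor/tests', '-v'])
print(cmd)
# → ['python', 'theano-nose', '--collect-only', '--with-id',
#    'theano/tensor/tests', '-v']
```

With this scheme nosetests itself interprets any directory argument, so the script no longer needs to guess which argument is the tests directory.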
    # It seems safer to fully regenerate the list of tests on each call.
    if os.path.isfile('.noseids'):
        os.remove('.noseids')
...@@ -142,7 +136,7 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile):
    python = sys.executable
    rval = subprocess.call(
        ([python, theano_nose, '--collect-only', '--with-id']
         + argv),
        stdin=dummy_in.fileno(),
        stdout=stdout.fileno(),
        stderr=stderr.fileno())
...@@ -172,7 +166,7 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile):
        rval = subprocess.call(
            ([python, theano_nose, '-q', '--with-id']
             + map(str, test_range)
             + argv),
            stdout=dummy_out.fileno(),
            stderr=dummy_out.fileno(),
            stdin=dummy_in.fileno())
...@@ -198,7 +192,7 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile):
        subprocess.call(
            ([python, theano_nose, '-v', '--with-id']
             + failed
             + argv),
            stdin=dummy_in.fileno(),
            stdout=stdout.fileno(),
            stderr=stderr.fileno())
...@@ -240,7 +234,7 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile):
                             n_tests + 1)):
            proc = subprocess.Popen(
                ([python, theano_nose, '-v', '--with-id']
                 + [str(test_id)] + argv +
                 ['--disabdocstring']),
                # the previous option calls a custom Nosetests plugin
                # precluding automatic substitution of doc. string for
...