Merge pull request #3476 from abergeron/move_config

Multiple fixes preparing for multi-gpu

Merge pull request #3476 from abergeron/move_config
645557f9 · Pascal Lamblin · 1ec1cd9b · 71dea2cf · 645557f9 · 645557f9
--- a/setup.cfg
+++ b/setup.cfg
 [nosetest]
 match=^test
 nocapture=1
+
+[flake8]
+ignore=E501,E123,E133
--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):

        theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()

-if config.device.startswith('cuda') or config.device.startswith('opencl') or \
-        config.gpuarray.init_device != '':
+if (config.device.startswith('cuda') or
+        config.device.startswith('opencl') or
+        config.init_gpu_device.startswith('cuda') or
+        config.init_gpu_device.startswith('opencl')):
    import theano.sandbox.gpuarray

 # Use config.numpy to call numpy.seterr

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
        self.default = default

        def filter(val):
+            # Accept the parameter's own default verbatim, or any value
+            # starting with "gpu", "opencl" or "cuda"; reject the rest.
-            if val.startswith('cpu') or val.startswith('gpu') \
+            if val == self.default or val.startswith('gpu') \
                    or val.startswith('opencl') or val.startswith('cuda'):
                return val
            else:
                raise ValueError(('Invalid value ("%s") for configuration '
                                  'variable "%s". Valid options start with '
-                                  'one of "cpu", "gpu", "opencl", "cuda"'
-                                  % (val, self.fullname)))
+                                  'one of "%s", "gpu", "opencl", "cuda"'
+                                  % (val, self.fullname, self.default)))
        over = kwargs.get("allow_override", True)
        super(DeviceParam, self).__init__(default, filter, over)

    def __str__(self):
-        return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
+        return '%s (%s, gpu*, opencl*, cuda*) ' % (self.fullname, self.default)

 AddConfigVar(
    'device',
@@ -94,15 +94,7 @@ AddConfigVar(
     "on it. Do not use upper case letters, only lower case even if "
     "NVIDIA use capital letters."),
    DeviceParam('cpu', allow_override=False),
-    in_c_key=False,)
-
-AddConfigVar('gpuarray.init_device',
-             """
-             Device to initialize for gpuarray use without moving
-             computations automatically.
-             """,
-             StrParam(''),
-             in_c_key=False)
+    in_c_key=False)

 AddConfigVar(
    'init_gpu_device',
@@ -110,12 +102,7 @@ AddConfigVar(
     "Unlike 'device', setting this option will NOT move computations, "
     "nor shared variables, to the specified GPU. "
     "It can be used to run GPU-specific tests on a particular GPU."),
-    EnumStr('', 'gpu',
-            'gpu0', 'gpu1', 'gpu2', 'gpu3',
-            'gpu4', 'gpu5', 'gpu6', 'gpu7',
-            'gpu8', 'gpu9', 'gpu10', 'gpu11',
-            'gpu12', 'gpu13', 'gpu14', 'gpu15',
-            allow_override=False),
+    DeviceParam('', allow_override=False),
    in_c_key=False)

 AddConfigVar(
@@ -131,6 +118,112 @@ AddConfigVar(
    in_c_key=False)


+def default_cuda_root():
+    v = os.getenv('CUDA_ROOT', "")
+    if v:
+        return v
+    s = os.getenv("PATH")
+    if not s:
+        return ''
+    for dir in s.split(os.path.pathsep):
+        if os.path.exists(os.path.join(dir, "nvcc")):
+            return os.path.split(dir)[0]
+    return ''
+
+AddConfigVar(
+    'cuda.root',
+    """directory with bin/, lib/, include/ for cuda utilities.
+       This directory is included via -L and -rpath when linking
+       dynamically compiled modules.  If AUTO and nvcc is in the
+       path, it will use one of nvcc parent directory.  Otherwise
+       /usr/local/cuda will be used.  Leave empty to prevent extra
+       linker directives.  Default: environment variable "CUDA_ROOT"
+       or else "AUTO".
+       """,
+    StrParam(default_cuda_root),
+    in_c_key=False)
+
+
+def filter_nvcc_flags(s):
+    assert isinstance(s, str)
+    flags = [flag for flag in s.split(' ') if flag]
+    if any([f for f in flags if not f.startswith("-")]):
+        raise ValueError(
+            "Theano nvcc.flags support only parameter/value pairs without"
+            " space between them. e.g.: '--machine 64' is not supported,"
+            " but '--machine=64' is supported. Please add the '=' symbol."
+            " nvcc.flags value is '%s'" % s)
+    return ' '.join(flags)
+
+AddConfigVar('nvcc.flags',
+             "Extra compiler flags for nvcc",
+             ConfigParam("", filter_nvcc_flags),
+             # Not needed in the c key as it is already added.
+             # We leave it out so that the md5 of the config does not
+             # change depending on whether theano.sandbox.cuda is loaded.
+             in_c_key=False)
+
+AddConfigVar('nvcc.compiler_bindir',
+             "If defined, nvcc compiler driver will seek g++ and gcc"
+             " in this directory",
+             StrParam(""),
+             in_c_key=False)
+
+AddConfigVar('nvcc.fastmath',
+             "",
+             BoolParam(False),
+             # Not needed in the c key as it is already added.
+             # We leave it out so that the md5 of the config does not
+             # change depending on whether theano.sandbox.cuda is loaded.
+             in_c_key=False)
+
+AddConfigVar('gpuarray.sync',
+             """If True, every op will make sure its work is done before
+                returning.  Setting this to True will slow down execution,
+                but give much more accurate results in profiling.""",
+             BoolParam(False),
+             in_c_key=True)
+
+AddConfigVar('dnn.conv.workmem',
+             "This flag is deprecated; use dnn.conv.algo_fwd.",
+             EnumStr(''),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.workmem_bwd',
+             "This flag is deprecated; use dnn.conv.algo_bwd.",
+             EnumStr(''),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.algo_fwd',
+             "Default implementation to use for CuDNN forward convolution.",
+             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
+                     'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.algo_bwd',
+             "Default implementation to use for CuDNN backward convolution.",
+             EnumStr('none', 'deterministic', 'fft', 'guess_once',
+                     'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+
+
+def default_dnn_path(suffix):
+    def f(suffix=suffix):
+        if config.cuda.root == '':
+            return ''
+        return os.path.join(config.cuda.root, suffix)
+    return f
+
+AddConfigVar('dnn.include_path',
+             "Location of the cudnn header (defaults to the cuda root)",
+             StrParam(default_dnn_path('include')))
+
+AddConfigVar('dnn.library_path',
+             "Location of the cudnn library (defaults to the cuda root)",
+             StrParam(default_dnn_path('lib64')))
+
 # This flag determines whether or not to raise error/warning message if
 # there is a CPU Op in the computational graph.
 AddConfigVar(

--- a/theano/configparser.py
+++ b/theano/configparser.py
@@ -102,7 +102,7 @@ def change_flags(**kwargs):
                l = [v for v in theano.configparser._config_var_list
                     if v.fullname == k]
                assert len(l) == 1
-                old_val[k] = l[0].__get__()
+                old_val[k] = l[0].__get__(True, None)
            try:
                for k in kwargs:
                    l = [v for v in theano.configparser._config_var_list
@@ -167,7 +167,7 @@ def _config_print(thing, buf):
    for cv in _config_var_list:
        print(cv, file=buf)
        print("    Doc: ", cv.doc, file=buf)
-        print("    Value: ", cv.__get__(), file=buf)
+        print("    Value: ", cv.__get__(True, None), file=buf)
        print("", file=buf)


@@ -182,7 +182,7 @@ def get_config_md5():
    all_opts = sorted([c for c in _config_var_list if c.in_c_key],
                      key=lambda cv: cv.fullname)
    return theano.gof.utils.hash_from_code('\n'.join(
-        ['%s = %s' % (cv.fullname, cv.__get__()) for cv in all_opts]))
+        ['%s = %s' % (cv.fullname, cv.__get__(True, None)) for cv in all_opts]))


 class TheanoConfigParser(object):
@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
        # Trigger a read of the value from config files and env vars
        # This allow to filter wrong value from the user.
        if not callable(configparam.default):
-            configparam.__get__()
+            configparam.__get__(root, type(root))
        else:
            # We do not want to evaluate now the default value
            # when it is a callable.
            try:
                fetch_val_for_key(configparam.fullname)
                # The user provided a value, filter it now.
-                configparam.__get__()
+                configparam.__get__(root, type(root))
            except KeyError:
                pass
        setattr(root.__class__, sections[0], configparam)
@@ -294,6 +294,7 @@ class ConfigParam(object):
        self.default = default
        self.filter = filter
        self.allow_override = allow_override
+        self.is_default = True
        # N.B. --
        # self.fullname  # set by AddConfigVar
        # self.doc       # set by AddConfigVar
@@ -304,16 +305,19 @@ class ConfigParam(object):
        # Calling `filter` here may actually be harmful if the default value is
        # invalid and causes a crash or has unwanted side effects.

-    def __get__(self, *args):
+    def __get__(self, cls, type_):
+        if cls is None:
+            return self
        if not hasattr(self, 'val'):
            try:
                val_str = fetch_val_for_key(self.fullname)
+                self.is_default = False
            except KeyError:
                if callable(self.default):
                    val_str = self.default()
                else:
                    val_str = self.default
-            self.__set__(None, val_str)
+            self.__set__(cls, val_str)
        # print "RVAL", self.val
        return self.val


--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -1171,7 +1171,7 @@ def apply_meth(tag):
            code = self.code_sections[tag]

            define_macros, undef_macros = self.get_c_macros(node, name)
-            return os.linesep.join([define_macros, code,
+            return os.linesep.join(['', define_macros, code,
                                    undef_macros])
        else:
            raise utils.MethodNotDefined(
@@ -1428,7 +1428,7 @@ class COp(Op):
            def_macros, undef_macros = self.get_c_macros(node, name)
            def_sub, undef_sub = self.get_sub_macros(sub)

-            return os.linesep.join([def_macros, def_sub,
+            return os.linesep.join(['', def_macros, def_sub,
                                    op_code,
                                    undef_sub, undef_macros])
        else:
@@ -1442,17 +1442,21 @@ class COp(Op):
            define_macros, undef_macros = self.get_c_macros(node, name,
                                                            check_input=False)

+            ctx = ""
+            if 'context' in sub:
+                ctx = ", %s" % (sub['context'],)
+
            # Generate the C code
            return """
                %(define_macros)s
                {
-                  if (%(func_name)s(%(func_args)s) != 0) {
+                  if (%(func_name)s(%(func_args)s%(ctx)s) != 0) {
                    %(fail)s
                  }
                }
                %(undef_macros)s
                """ % dict(func_name=self.func_name,
-                           fail=sub['fail'],
+                           fail=sub['fail'], ctx=ctx,
                           func_args=self.format_c_function_args(inp, out),
                           define_macros=define_macros,
                           undef_macros=undef_macros)

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
 # import dependency. So we also test it in the file theano/__init__.py
 if config.device.startswith('gpu'):
    use(device=config.device, force=config.force_device, test_driver=False)
-elif config.init_gpu_device:
+elif config.init_gpu_device.startswith('gpu'):
    assert config.device == "cpu", (
        "We can use the Theano flag init_gpu_device"
        " only when the Theano flag device=='cpu'")

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt

 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler

-import theano.sandbox.dnn_flags
-

 def dnn_available():
    if dnn_available.avail is None:
@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  return 1;
 }
 """
+            params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
+            if config.dnn.include_path:
+                params.append("-I" + config.dnn.include_path)
+            if config.dnn.library_path:
+                params.append("-L" + config.dnn.library_path)
            # Do not run here the test program. It would run on the
            # default gpu, not the one selected by the user. If mixed
            # GPU are installed or if the GPUs are configured in
            # exclusive mode, this cause bad detection.
            comp, out, err = NVCC_compiler.try_flags(
-                ["-l", "cudnn", "-I" + os.path.dirname(__file__),
-                 "-I" + config.dnn.include_path,
-                 "-L" + config.dnn.library_path],
-                preambule=preambule, body=body,
+                params=params, preambule=preambule, body=body,
                try_run=False, output=True)

            dnn_available.avail = comp

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -8,6 +8,7 @@ import warnings

 import numpy

+from theano import config
 from theano.compat import decode, decode_iter
 from theano.gof import local_bitwidth
 from theano.gof.utils import hash_from_file
@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen

 _logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")

-from theano.configparser import (config, AddConfigVar, StrParam,
-                                 BoolParam, ConfigParam)
-
-AddConfigVar('nvcc.compiler_bindir',
-             "If defined, nvcc compiler driver will seek g++ and gcc"
-             " in this directory",
-             StrParam(""),
-             in_c_key=False)
-
-user_provided_cuda_root = True
-
-
-def default_cuda_root():
-    global user_provided_cuda_root
-    v = os.getenv('CUDA_ROOT', "")
-    user_provided_cuda_root = False
-    if v:
-        return v
-    return find_cuda_root()
-
-AddConfigVar('cuda.root',
-        """directory with bin/, lib/, include/ for cuda utilities.
-        This directory is included via -L and -rpath when linking
-        dynamically compiled modules.  If AUTO and nvcc is in the
-        path, it will use one of nvcc parent directory.  Otherwise
-        /usr/local/cuda will be used.  Leave empty to prevent extra
-        linker directives.  Default: environment variable "CUDA_ROOT"
-        or else "AUTO".
-        """,
-        StrParam(default_cuda_root),
-        in_c_key=False)
-
-
-def filter_nvcc_flags(s):
-    assert isinstance(s, str)
-    flags = [flag for flag in s.split(' ') if flag]
-    if any([f for f in flags if not f.startswith("-")]):
-        raise ValueError(
-            "Theano nvcc.flags support only parameter/value pairs without"
-            " space between them. e.g.: '--machine 64' is not supported,"
-            " but '--machine=64' is supported. Please add the '=' symbol."
-            " nvcc.flags value is '%s'" % s)
-    return ' '.join(flags)
-
-AddConfigVar('nvcc.flags',
-             "Extra compiler flags for nvcc",
-             ConfigParam("", filter_nvcc_flags),
-             # Not needed in c key as it is already added.
-             # We remove it as we don't make the md5 of config to change
-             # if theano.sandbox.cuda is loaded or not.
-             in_c_key=False)
-
-
-AddConfigVar('nvcc.fastmath',
-             "",
-             BoolParam(False),
-             # Not needed in c key as it is already added.
-             # We remove it as we don't make the md5 of config to change
-             # if theano.sandbox.cuda is loaded or not.
-             in_c_key=False)
-
 nvcc_path = 'nvcc'
 nvcc_version = None

@@ -115,14 +55,6 @@ def is_nvcc_available():
            return False


-def find_cuda_root():
-    s = os.getenv("PATH")
-    if not s:
-        return
-    for dir in s.split(os.path.pathsep):
-        if os.path.exists(os.path.join(dir, "nvcc")):
-            return os.path.split(dir)[0]
-
 rpath_defaults = []


@@ -229,7 +161,7 @@ class NVCC_compiler(Compiler):
        include_dirs
            A list of include directory names (each gets prefixed with -I).
        lib_dirs
-            A list of library search path directory names (each gets 
+            A list of library search path directory names (each gets
            prefixed with -L).
        libs
            A list of libraries to link with (each gets prefixed with -l).
@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
        # provided an cuda.root flag, we need to add one, but
        # otherwise, we don't add it. See gh-1540 and
        # https://wiki.debian.org/RpathIssue for details.
-        if (user_provided_cuda_root and
+
+        if (not type(config.cuda).root.is_default and
            os.path.exists(os.path.join(config.cuda.root, 'lib'))):

            rpaths.append(os.path.join(config.cuda.root, 'lib'))

--- a/theano/sandbox/dnn_flags.py
+++ b/theano/sandbox/dnn_flags.py
-"""
-This module contains the configuration flags for cudnn support.
-
-Those are shared between the cuda and gpuarray backend which is why
-they are in this file.
-"""
-import os.path
-
-from theano.configparser import AddConfigVar, EnumStr, StrParam
-from theano import config
-
-AddConfigVar('dnn.conv.workmem',
-             "This flag is deprecated; use dnn.conv.algo_fwd.",
-             EnumStr(''),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.workmem_bwd',
-             "This flag is deprecated; use dnn.conv.algo_bwd.",
-             EnumStr(''),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.algo_fwd',
-             "Default implementation to use for CuDNN forward convolution.",
-             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.algo_bwd',
-             "Default implementation to use for CuDNN backward convolution.",
-             EnumStr('none', 'deterministic', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
-             in_c_key=False)
-
-AddConfigVar('dnn.include_path',
-             "Location of the cudnn header (defaults to the cuda root)",
-             StrParam(lambda: os.path.join(config.cuda.root, 'include')))
-
-AddConfigVar('dnn.library_path',
-             "Location of the cudnn header (defaults to the cuda root)",
-             StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -19,13 +19,6 @@ try:
 except ImportError:
    pygpu = None

-AddConfigVar('gpuarray.sync',
-             """If True, every op will make sure its work is done before
-                returning.  Setting this to True will slow down execution,
-                but give much more accurate results in profiling.""",
-             BoolParam(False),
-             in_c_key=True)
-
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                  GpuArraySharedVariable, gpuarray_shared_constructor)
@@ -57,8 +50,9 @@ if pygpu:
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
-        elif config.gpuarray.init_device != '':
-            init_dev(config.gpuarray.init_device)
+        elif (config.init_gpu_device.startswith('cuda') or
+              config.init_gpu_device.startswith('opencl')):
+            init_dev(config.init_gpu_device)

        from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
                                GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
@@ -70,7 +64,8 @@ if pygpu:
    except Exception:
        error("Could not initialize pygpu, support disabled", exc_info=True)
 else:
-    if (config.gpuarray.init_device != '' or
-        config.device.startswith('opencl') or
-        config.device.startswith('cuda')):
+    if (config.init_gpu_device.startswith('cuda') or
+            config.init_gpu_device.startswith('opencl') or
+            config.device.startswith('opencl') or
+            config.device.startswith('cuda')):
        error("pygpu was configured but could not be imported", exc_info=True)
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
--- a/theano/sandbox/gpuarray/conv.py
+++ b/theano/sandbox/gpuarray/conv.py
@@ -5,17 +5,15 @@ import theano
 from theano import config, gof

 try:
-    import pygpu
    from pygpu import gpuarray
 except ImportError:
    pass

-from six.moves import reduce
-from .comp import NVCC_compiler
 from .type import GpuArrayType
-from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
+from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
 from theano.gof import utils

+
 class GpuConv(GpuKernelBase, gof.Op):
    """
    Implement the batched and stacked 2d convolution on the gpu.
@@ -70,19 +68,19 @@ class GpuConv(GpuKernelBase, gof.Op):
        raise ValueError(mode)

    def __init__(self, border_mode,
-            subsample=(1, 1),
-            logical_img_hw=None,
-            logical_kern_hw=None,
-            logical_kern_align_top=True,
-            version=-1,
-            direction_hint=None,
-            verbose=0,
-            kshp=None,
-            imshp=None,
-            max_threads_dim0=None,
-            nkern=None,
-            bsize=None,
-            fft_opt=True):
+                 subsample=(1, 1),
+                 logical_img_hw=None,
+                 logical_kern_hw=None,
+                 logical_kern_align_top=True,
+                 version=-1,
+                 direction_hint=None,
+                 verbose=0,
+                 kshp=None,
+                 imshp=None,
+                 max_threads_dim0=None,
+                 nkern=None,
+                 bsize=None,
+                 fft_opt=True):
        self.border_mode = border_mode
        self.subsample = subsample
        if logical_img_hw is not None:
@@ -182,7 +180,7 @@ class GpuConv(GpuKernelBase, gof.Op):
    def flops(self, inputs, outputs):
        """
        Useful with the hack in profilemode to print the MFlops.
-        
+
        """
        images, kerns = inputs
        out, = outputs
@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
        nb = 0
        if self.kshp is not None:
            nb = self.kshp[1]
-        return ['-DTHEANO_KERN_WID=' + str(nb)]  # ,'-g','-G']
+        return ['-DTHEANO_KERN_WID=' + str(nb)]

    def c_headers(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['<stdint.h>', '<stdio.h>', 'cuda.h',
-                '<gpuarray/extension.h>', '<numpy_compat.h>',
-                '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
-
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
-        else:
-            return []
+        return ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 21)
-
-    def c_init_code(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['setup_ext_cuda();']
+        return (0, 22)

    def c_code(self, node, nodename, inp, out_, sub):
        img, kern = inp

--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -26,10 +26,7 @@ from .conv import GpuConv
 # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
 from .nnet import GpuSoftmax
 from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
-from .opt_util import alpha_merge, output_merge
-
-# We need to import this to define the flags.
-from theano.sandbox import dnn_flags  # noqa
+from .opt_util import alpha_merge, output_merge, inplace_allocempty


 def dnn_available():
@@ -50,7 +47,6 @@ def dnn_available():
        dnn_available.avail = False
    preambule = """
 #include <stdio.h>
-#include <cuda.h>
 #include <cudnn.h>
 #include <cudnn_helper.h>
 """
@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  return 1;
 }
 """
+
+    params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
+    if config.dnn.include_path:
+        params.append("-I" + config.dnn.include_path)
+    if config.dnn.library_path:
+        params.append("-L" + config.dnn.library_path)
    # Do not run here the test program. It would run on the
    # default gpu, not the one selected by the user. If mixed
    # GPU are installed or if the GPUs are configured in
    # exclusive mode, this cause bad detection.
    comp, out, err = GCC_compiler.try_flags(
-        ["-l", "cudnn", "-I" + os.path.dirname(__file__),
-         "-I" + config.dnn.include_path,
-         "-L" + config.dnn.library_path],
-        preambule=preambule, body=body,
+        params, preambule=preambule, body=body,
        try_run=False, output=True)

    dnn_available.avail = comp
@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
                       'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')


-@local_optimizer([GpuDnnConv], inplace=True)
-def local_dnn_conv_inplace(node):
-    if type(node.op) != GpuDnnConv or node.op.inplace:
-        return
-    inputs = list(node.inputs)
-    dest = inputs[2]
-    if (dest.owner and
-            isinstance(dest.owner.op, GpuAllocEmpty) and
-            len(dest.clients) > 1):
-        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
+@inplace_allocempty(GpuDnnConv, 2)
+def local_dnn_conv_inplace(node, inputs):
    return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]


-@local_optimizer([GpuDnnConvGradW], inplace=True)
-def local_dnn_convgw_inplace(node):
-    if type(node.op) != GpuDnnConvGradW or node.op.inplace:
-        return
-    inputs = list(node.inputs)
-    dest = inputs[2]
-    if (dest.owner and
-            isinstance(dest.owner.op, GpuAllocEmpty) and
-            len(dest.clients) > 1):
-        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
+@inplace_allocempty(GpuDnnConvGradW, 2)
+def local_dnn_convgw_inplace(node, inputs):
    return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]


-@local_optimizer([GpuDnnConvGradI], inplace=True)
-def local_dnn_convgi_inplace(node):
-    if type(node.op) != GpuDnnConvGradI or node.op.inplace:
-        return
-    inputs = list(node.inputs)
-    dest = inputs[2]
-    if (dest.owner and
-            isinstance(dest.owner.op, GpuAllocEmpty) and
-            len(dest.clients) > 1):
-        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
+@inplace_allocempty(GpuDnnConvGradI, 2)
+def local_dnn_convgi_inplace(node, inputs):
    return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]

 optdb.register('local_dnna_conv_inplace',
               tensor.opt.in2out(local_dnn_conv_inplace,
                                 local_dnn_convgw_inplace,
                                 local_dnn_convgi_inplace,
-                                 name="local_dnn_conv_inplace"),
+                                 name="local_dnna_conv_inplace"),
               70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')


 @register_opt('cudnn')
-@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
+@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
 def local_dnn_conv_alpha_merge(node, *inputs):
    return [GpuDnnConv(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
-@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
+@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
 def local_dnn_convw_alpha_merge(node, *inputs):
    return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
-@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
+@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
 def local_dnn_convi_alpha_merge(node, *inputs):
    return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
-@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
+@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_conv_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConv(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
-@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
+@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_convw_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
-@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
+@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_convi_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
--- a/theano/sandbox/gpuarray/kernel_codegen.py
+++ b/theano/sandbox/gpuarray/kernel_codegen.py
@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
 """

 try:
-    import pygpu
    from pygpu import gpuarray
 except ImportError:
    pass

+
 def nvcc_kernel(name, params, body):
    """
    Return the c code of a kernel function.
@@ -174,16 +174,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):

    """
    ctype = gpuarray.dtype_to_ctype(dtype)
-    return [
-            # get max of buf (trashing all but buf[0])
-            inline_reduce_max(N, buf, threadPos, threadCount),
+    # get max of buf (trashing all but buf[0])
+    return [inline_reduce_max(N, buf, threadPos, threadCount),
            '__syncthreads()',
            ('%s row_max = ' + buf + '[0]') % ctype,
            '__syncthreads()',
            'for(int __i=' + threadPos + '; __i<' + N +
-                  '; __i+=' + threadCount + '){',
-                buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
-                buf2 + '[__i] = ' + buf + '[__i]',
+            '; __i+=' + threadCount + '){',
+            buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
+            buf2 + '[__i] = ' + buf + '[__i]',
            '}',
            '__syncthreads()',
            inline_reduce_sum(N, buf, threadPos, threadCount),
@@ -192,8 +191,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
            '__syncthreads()',
            # divide each exp() result by the sum to complete the job.
            'for(int __i=' + threadPos + '; __i<' + N +
-                  '; __i+=' + threadCount + '){',
-                buf + '[__i] = ' + buf2 + '[__i] / row_sum',
+            '; __i+=' + threadCount + '){',
+            buf + '[__i] = ' + buf2 + '[__i] / row_sum',
            '}',
            '__syncthreads()',
            ]
@@ -232,7 +231,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
        Optional, the dtype of the output.
    manner_fn
        A function that accepts strings of arguments a and b, and returns c code
-        for their reduction. 
+        for their reduction.
        Example: return "%(a)s + %(b)s" for a sum reduction.
    manner_init
        A function that accepts strings of arguments a and return c code for its
@@ -259,7 +258,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
        loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
                                                 locals()))
    loop_line2 = manner_fn("%s[%s]" % (buf, pos),
-                          "%s[i]" % buf)
+                           "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
@@ -324,7 +323,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,

    Parameters
    ----------
-    N 
+    N
        Length of the buffer, atleast waprSize(32).
    buf
        A shared memory buffer of size warpSize * sizeof(dtype).

--- a/theano/sandbox/gpuarray/neighbours.py
+++ b/theano/sandbox/gpuarray/neighbours.py
-import os
 import numpy

 from theano import Op, Apply, config
-from theano.gof import local_optimizer
 from theano.tensor.nnet.neighbours import Images2Neibs
 import theano.tensor as T

 try:
    import pygpu
-    from pygpu import gpuarray, elemwise
+    from pygpu import gpuarray
 except ImportError:
    pass

-from .basic_ops import (as_gpuarray_variable,
-                        host_from_gpu, gpu_from_host,
-                        GpuKernelBase, Kernel)
+from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
 from .opt import register_opt as register_gpu_opt, op_lifter
 from .type import GpuArrayType
-from .comp import NVCC_compiler


 class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                                   dtype=ten4.type.dtype)()])

    def c_code_cache_version(self):
-        return (10,1)
+        return (11,)

    def c_headers(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
-                '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
-
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
-        else:
-            return []
-
-    def c_init_code(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['setup_ext_cuda();']
+        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype

--- a/theano/sandbox/gpuarray/nerv.py
+++ b/theano/sandbox/gpuarray/nerv.py
@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):


 @opt.register_opt()
-@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
+@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
 def local_gemm16_alpha_merge(node, *inputs):
    return [Gemm16(relu=node.op.relu)(*inputs)]


 @opt.register_opt()
-@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
+@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
 def local_gemm16_output_merge(node, *inputs):
    return [Gemm16(relu=node.op.relu)(*inputs)]


--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):


 @register_opt()
-@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2)
+@alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
 def local_gpuagemm_alpha_merge(node, *inputs):
    return [gpugemm_no_inplace(*inputs)]


 @register_opt()
-@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2)
+@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
 def local_gpuagemm_output_merge(node, *inputs):
    return [gpugemm_no_inplace(*inputs)]


--- a/theano/sandbox/gpuarray/opt_util.py
+++ b/theano/sandbox/gpuarray/opt_util.py
--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
    def _f16_ok(self):
        return self.iadd_node.op._f16_ok

-    def c_header_dirs(self):
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
-        else:
-            return []
-
    def c_headers(self):
        return self.iadd_node.op.c_headers()

-    def c_compiler(self):
-        return self.iadd_node.op.c_compiler()
-
    def c_init_code(self):
        return self.iadd_node.op.c_init_code()

@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        elemwise_version = self.iadd_node.c_code_cache_version()
        if not parent_version or not elemwise_version:
            return
-        return parent_version + elemwise_version + (2,)
+        return parent_version + elemwise_version + (3,)


 class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):

--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
 import unittest
 from theano.compat import izip
-from copy import copy, deepcopy

 from six import iteritems

@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
 # Don't import test classes otherwise they get tested as part of the file
 from theano.tensor.tests import test_basic
 from theano.tensor.tests.test_basic import rand, safe_make_node
+from theano.tests import unittest_tools as utt
 from theano.tests.unittest_tools import SkipTest

 import theano.sandbox.gpuarray

+from ..type import (GpuArrayType,
+                    gpuarray_shared_constructor)
+from ..basic_ops import (
+    host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
+    gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
+    gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
+from ..subtensor import GpuSubtensor
+
+import theano.sandbox.cuda as cuda_ndarray
+
+try:
+    from pygpu import gpuarray
+except ImportError:
+    pass  # pygpu is optional; its absence is reported by the SkipTest check below
+
 if theano.sandbox.gpuarray.pygpu is None:
    raise SkipTest("pygpu not installed")

 # If you are writing a new test file, don't copy this code, but rather
 # import stuff from this file (like mode_with_gpu) to reuse it.
-import theano.sandbox.cuda as cuda_ndarray
 if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
    if not cuda_ndarray.use.device_number:
        # We should not enable all the use like the flag device=gpu,
@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
 if not theano.sandbox.gpuarray.pygpu_activated:
    raise SkipTest("pygpu disabled")

-from ..type import (GpuArrayType,
-                    gpuarray_shared_constructor)
-from ..basic_ops import (
-    host_from_gpu, gpu_from_host,
-    gpu_alloc, GpuAlloc,
-    GpuAllocEmpty,
-    gpu_from_cuda,
-    cuda_from_gpu, HostFromGpu,
-    GpuContiguous,
-    GpuFromHost, GpuReshape,
-    gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
-from ..subtensor import GpuSubtensor
-
-from theano.tests import unittest_tools as utt
 utt.seed_rng()
 rng = numpy.random.RandomState(seed=utt.fetch_seed())

-from pygpu import gpuarray
-
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
@@ -63,22 +61,6 @@ else:
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')


-def may_fail(msg, EClass):
-    """Mark a test that requires very specific conditions to work to
-       mask a specific exception class."""
-    def test_decorator(f):
-        def wrapper():
-            try:
-                f()
-            except Exception as e:
-                if isinstance(e, EClass):
-                    raise SkipTest(msg, e)
-                raise
-        wrapper.__name__ = f.__name__
-        return wrapper
-    return test_decorator
-
-
 def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
                 on_unused_input='raise', name=None):
    if mode is None:
@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
                    else:
                        err_msg = ("Test %s::%s: exception raised during test "
                                   "call was not the same as the reference "
-                                   "call (got: %s, expected %s)") % \
+                                   "call (got: %s, expected %s)" %
                                   (self.gpu_op, testname, type(exc),
-                                    type(ref_e))
+                                    type(ref_e)))
                        exc.args += (err_msg,)
                        raise

@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
                                                        expected):
                    self.fail(("Test %s::%s: Output %s gave the wrong "
                               "value. With inputs %s, expected %s "
-                               "(dtype %s), got %s (dtype %s).") % (
-                            self.op, testname, i, inputs, expected,
-                            expected.dtype, variable, variable.dtype))
+                               "(dtype %s), got %s (dtype %s)." %
+                               (self.op, testname, i, inputs, expected,
+                                expected.dtype, variable, variable.dtype)))

            for description, check in iteritems(self.checks):
                if not check(inputs, variables):
@@ -250,36 +232,6 @@ def test_transfer_strided():
    assert numpy.all(fv == av)


-@may_fail("Op fails if both contexts are not the same and it's rare "
-          "that the tests will be run this way", ValueError)
-def test_transfer_cuda_gpu():
-    import theano.sandbox.cuda as cuda_ndarray
-    if cuda_ndarray.cuda_available is False:
-        raise SkipTest("Can't test interaction with cuda if cuda not present")
-    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
-    c = cuda_ndarray.CudaNdarrayType((False, False))('c')
-
-    av = theano._asarray(rng.rand(5, 4), dtype='float32')
-    gv = gpuarray.array(av)
-    cv = cuda_ndarray.CudaNdarray(av)
-    gvs = gv[:, ::-2]
-    cvs = cv[:, ::-2]
-
-    f = theano.function([c], gpu_from_cuda(c))
-    fv = f(cv)
-    assert GpuArrayType.values_eq_approx(fv, gv)
-
-    fvs = f(cvs)
-    assert GpuArrayType.values_eq_approx(fvs, gvs)
-
-    f = theano.function([g], cuda_from_gpu(g))
-    fv = f(gv)
-    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)
-
-    fvs = f(gvs)
-    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
-
-
 def gpu_alloc_expected(x, *shp):
    g = gpuarray.empty(shp, dtype=x.dtype)
    g[:] = x
@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
    gpu_op=gpu_alloc,
    cases=dict(
        correct01=(rand(), numpy.int32(7)),
-# just gives a DeepCopyOp with possibly wrong results on the CPU
-#        correct01_bcast=(rand(1), numpy.int32(7)),
+        # just gives a DeepCopyOp with possibly wrong results on the CPU
+        # correct01_bcast=(rand(1), numpy.int32(7)),
        correct02=(rand(), numpy.int32(4), numpy.int32(7)),
        correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
        correct13=(rand(7), numpy.int32(2), numpy.int32(4),
@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
    cv = gpuarray.asarray(numpy.random.rand(5, 4),
                          dtype='float32')

-    gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
-    host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu
    f = theano.function([a], gpu_from_host(a), mode=m)
    assert gpu_from_host in [x.op
                             for x in f.maker.fgraph.toposort()]

--- a/theano/sandbox/gpuarray/tests/test_blas.py
+++ b/theano/sandbox/gpuarray/tests/test_blas.py
@@ -6,8 +6,7 @@ import numpy
 import theano
 from theano import tensor
 from theano.tests import unittest_tools as utt
-from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
-                                _dot22)
+from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
 from theano.tensor.tests.test_blas import TestGer, BaseGemv

 from .. import gpuarray_shared_constructor
@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
                             mode_with_gpu)

 from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
-                    gpugemm_inplace, gpugemm_no_inplace,
+                    gpugemm_inplace,
                    gpuger_inplace, gpuger_no_inplace,
                    GpuGer, gpu_dot22, GpuGemm)


-GpuGemvTester = makeTester('GpuGemvTester',
-                           op=gemv_inplace, gpu_op=gpugemv_inplace,
-                           cases=dict(
-        dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
-        dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
-#        test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
-#        test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
-#        test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
-        test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
-        )
-)
+GpuGemvTester = makeTester(
+    'GpuGemvTester',
+    op=gemv_inplace, gpu_op=gpugemv_inplace,
+    cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
+               dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
+               # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
+               # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
+               # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
+               test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
+               )
+    )


 class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
@@ -48,24 +47,24 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
            return theano.shared(val)


-GpuGemmTester = makeTester('GpuGemmTester',
-                           op=gemm_inplace, gpu_op=gpugemm_inplace,
-                           cases=dict(
-        test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
-        test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
-        test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
-        test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
-        test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
-        test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
-        test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
-        test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
-        test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
- #       test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
- #       test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
- #       test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
- #       test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
-        )
-)
+GpuGemmTester = makeTester(
+    'GpuGemmTester',
+    op=gemm_inplace, gpu_op=gpugemm_inplace,
+    cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
+               test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
+               test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
+               test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
+               test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
+               test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
+               test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
+               test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
+               test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
+               # test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
+               # test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
+               # test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
+               # test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
+               )
+    )


 class TestGpuSger(TestGer):
@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):

    def test_f32_0_0(self):
        raise SkipTest('0-sized objects not supported')
+
    def test_f32_1_0(self):
        raise SkipTest('0-sized objects not supported')
+
    def test_f32_0_1(self):
        raise SkipTest('0-sized objects not supported')

@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):


 GpuDot22Tester = makeTester(
-    'GpuGemmTester',
+    'GpuDot22Tester',
    op=_dot22, gpu_op=gpu_dot22,
    cases=dict(
        test1=[rand(3, 4), rand(4, 5)],
        test2=[rand(1, 4), rand(4, 5)],
        test3=[rand(3, 1), rand(1, 5)],
        test4=[rand(3, 4), rand(4, 1)],
-#        test5=[rand(0, 4), rand(4, 5)],
-#        test6=[rand(3, 0), rand(0, 5)],
-#        test7=[rand(3, 4), rand(4, 0)],
-#        test8=[rand(0, 4), rand(4, 0)],
-#        test9=[rand(0, 0), rand(0, 0)],
+        # test5=[rand(0, 4), rand(4, 5)],
+        # test6=[rand(3, 0), rand(0, 5)],
+        # test7=[rand(3, 4), rand(4, 0)],
+        # test8=[rand(0, 4), rand(4, 0)],
+        # test9=[rand(0, 0), rand(0, 0)],
    )
 )

+
 def test_hgemm_swap():
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
@@ -149,6 +151,7 @@ def test_hgemm_swap():

    utt.assert_allclose(of, on)

+
 def test_hgemm_alpha_output_merge():
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':

--- a/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
--- a/theano/sandbox/gpuarray/tests/test_neighbours.py
+++ b/theano/sandbox/gpuarray/tests/test_neighbours.py
-import unittest

 from theano.tensor.nnet.tests import test_neighbours
 # We let that import do the init of the back-end if needed.
-from .test_basic_ops import (mode_with_gpu,
-                             mode_without_gpu)
+from .test_basic_ops import mode_with_gpu

 from ..neighbours import GpuImages2Neibs


--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
 from __future__ import print_function
-from nose.plugins.skip import SkipTest
+
 import numpy
 import unittest

@@ -7,8 +7,6 @@ import theano
 import theano.tensor as T
 import theano.tests.unittest_tools as utt

-from theano.sandbox import gpuarray
-
 # We let that import do the init of the back-end if needed.
 from .test_basic_ops import (mode_with_gpu,
                             mode_without_gpu)
@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
        n_in = 4098
        n_out = 4099

-    x = T.fmatrix('x')
    y = T.lvector('y')

    b = T.fvector('b')
-    #W = T.fmatrix('W')

    # we precompute the dot with big shape before to allow the test of
    # GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
-    #(the launch timed out and was terminated) on GPU card not
+    # (the launch timed out and was terminated) on GPU card not
    # powerful enough. We need the big shape to check for corner
    # case.
    dot_result = T.fmatrix('dot_result')
@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():

    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
                       dtype=numpy.float32)
-    #?????yy = numpy.ones((batch_size,),dtype='float32')
    yy = numpy.ones((batch_size,), dtype='int32')
    b_values = numpy.zeros((n_out,), dtype='float32')
    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)
-    # theano.printing.debugprint(classify)
-    # theano.printing.debugprint(classify_gpu)

    assert any([isinstance(node.op,
                           T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
    We check that we loop when their is too much threads

    """
-    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
-        n_in = 4098
        n_out = 4099

    # Seed numpy.random with config.unittests.rseed
@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():

    rtol = 1e-5
    atol = 1e-6
-    if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
-        abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
-        scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
-        max_i = scaled_err.argmax()
-
-        print('max err index:', max_i, max_i / batch_size, end=' ')
-        print(max_i % batch_size, max_i / n_out, max_i & n_out)
-        print('At that index:')
-        print('err:', scaled_err.flatten()[max_i])
-        print('absolute error:', abs_err.flatten()[max_i])
-        print('relative error:', rel_err.flatten()[max_i])
-        print('cpu_out:', cpu_out.flatten()[max_i])
-        print('gpu_out:', gpu_out.flatten()[max_i])
-        print('softmax_output_value:', softmax_output_value.flatten()[max_i])
-        print('dnll_value:', dnll_value[max_i / n_out])
-        print('y_idx_value:', y_idx_value[max_i / n_out])
-
-        assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
-            rtol, atol)
+    utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)


 def test_softmax_with_bias_float16():
@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
    softmax_with_bias_unittest_template(dtypeInput='float32',
                                        dtypeBias='float16')

+
 def test_softmax_with_bias_float32():
    softmax_with_bias_unittest_template(dtypeInput='float32',
                                        dtypeBias='float32')
@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):

    TODO: check that we loop when there are too many threads. (THIS IS
    NOT IMPLEMENTED)
+
    """
    x = T.matrix('x', dtype=dtypeInput)
    b = T.vector('b', dtype=dtypeBias)
@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
 def test_softmax_float16():
    softmax_unittest_template('float16')

+
 def test_softmax_float32():
    softmax_unittest_template('float32')

+
 def test_softmax_float64():
    softmax_unittest_template('float64')


--- a/theano/sandbox/gpuarray/tests/test_type.py
+++ b/theano/sandbox/gpuarray/tests/test_type.py
-import operator
-
 import numpy

 import theano
@@ -25,7 +23,6 @@ def test_deep_copy():

 def test_values_eq_approx():
    a = rand_gpuarray(20, dtype='float32')
-    g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
    assert GpuArrayType.values_eq_approx(a, a)
    b = a.copy()
    b[0] = numpy.asarray(b[0]) + 1.

--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
@@ -184,7 +184,7 @@ class GpuArrayType(Type):
    @staticmethod
    def may_share_memory(a, b):
        if (not isinstance(a, gpuarray.GpuArray) or
-               not isinstance(b, gpuarray.GpuArray)):
+                not isinstance(b, gpuarray.GpuArray)):
            return False
        return pygpu.gpuarray.may_share_memory(a, b)

@@ -200,11 +200,12 @@ class GpuArrayType(Type):
                self.broadcastable == other.broadcastable)

    def convert_variable(self, var):
-        if (type(self) == type(var.type) and
-                self.typecode == var.type.typecode and
-                self.ndim == var.type.ndim and
+        vt = var.type
+        if (type(self) == type(vt) and
+                self.typecode == vt.typecode and
+                self.ndim == vt.ndim and
                all(sb == ob or ob for sb, ob in zip(self.broadcastable,
-                                                     var.type.broadcastable))):
+                                                     vt.broadcastable))):
            return theano.tensor.patternbroadcast(var, self.broadcastable)

    def __hash__(self):

--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
@@ -157,24 +157,11 @@ whitelist_flake8 = [
    "sandbox/linalg/ops.py",
    "sandbox/linalg/__init__.py",
    "sandbox/linalg/tests/test_linalg.py",
-    "sandbox/gpuarray/basic_ops.py",
-    "sandbox/gpuarray/nnet.py",
-    "sandbox/gpuarray/elemwise.py",
-    "sandbox/gpuarray/type.py",
    "sandbox/gpuarray/__init__.py",
-    "sandbox/gpuarray/kernel_codegen.py",
-    "sandbox/gpuarray/conv.py",
-    "sandbox/gpuarray/neighbours.py",
    "sandbox/gpuarray/tests/test_subtensor.py",
    "sandbox/gpuarray/tests/test_scan.py",
-    "sandbox/gpuarray/tests/test_neighbours.py",
-    "sandbox/gpuarray/tests/test_conv_cuda_ndarray.py",
-    "sandbox/gpuarray/tests/test_type.py",
    "sandbox/gpuarray/tests/test_opt.py",
-    "sandbox/gpuarray/tests/test_blas.py",
    "sandbox/gpuarray/tests/test_elemwise.py",
-    "sandbox/gpuarray/tests/test_nnet.py",
-    "sandbox/gpuarray/tests/test_basic_ops.py",
    "scan_module/scan_utils.py",
    "scan_module/scan_views.py",
    "scan_module/scan.py",