Commit 645557f9 authored by Pascal Lamblin

Merge pull request #3476 from abergeron/move_config

Multiple fixes preparing for multi-gpu
[nosetest]
match=^test
nocapture=1
[flake8]
ignore=E501,E123,E133
......@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if config.device.startswith('cuda') or config.device.startswith('opencl') or \
config.gpuarray.init_device != '':
if (config.device.startswith('cuda') or
config.device.startswith('opencl') or
config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
......
......@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
self.default = default
def filter(val):
if val.startswith('cpu') or val.startswith('gpu') \
if val == self.default or val.startswith('gpu') \
or val.startswith('opencl') or val.startswith('cuda'):
return val
else:
raise ValueError(('Invalid value ("%s") for configuration '
'variable "%s". Valid options start with '
'one of "cpu", "gpu", "opencl", "cuda"'
% (val, self.fullname)))
'one of "%s", "gpu", "opencl", "cuda"'
% (self.default, val, self.fullname)))
over = kwargs.get("allow_override", True)
super(DeviceParam, self).__init__(default, filter, over)
def __str__(self):
return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
return '%s (%s, gpu*, opencl*, cuda*) ' % (self.fullname, self.default)
AddConfigVar(
'device',
......@@ -94,15 +94,7 @@ AddConfigVar(
"on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."),
DeviceParam('cpu', allow_override=False),
in_c_key=False,)
AddConfigVar('gpuarray.init_device',
"""
Device to initialize for gpuarray use without moving
computations automatically.
""",
StrParam(''),
in_c_key=False)
in_c_key=False)
AddConfigVar(
'init_gpu_device',
......@@ -110,12 +102,7 @@ AddConfigVar(
"Unlike 'device', setting this option will NOT move computations, "
"nor shared variables, to the specified GPU. "
"It can be used to run GPU-specific tests on a particular GPU."),
EnumStr('', 'gpu',
'gpu0', 'gpu1', 'gpu2', 'gpu3',
'gpu4', 'gpu5', 'gpu6', 'gpu7',
'gpu8', 'gpu9', 'gpu10', 'gpu11',
'gpu12', 'gpu13', 'gpu14', 'gpu15',
allow_override=False),
DeviceParam('', allow_override=False),
in_c_key=False)
AddConfigVar(
......@@ -131,6 +118,112 @@ AddConfigVar(
in_c_key=False)
def default_cuda_root():
    """Compute the default value for the ``cuda.root`` config variable.

    Returns the ``CUDA_ROOT`` environment variable when it is set and
    non-empty.  Otherwise, scan every directory on ``PATH`` for an
    ``nvcc`` executable and return that directory's parent (``nvcc``
    conventionally lives in ``<cuda_root>/bin``).  Returns the empty
    string when nothing is found.
    """
    env_root = os.getenv('CUDA_ROOT', "")
    if env_root:
        return env_root
    search_path = os.getenv("PATH")
    if not search_path:
        return ''
    # `dir` renamed to avoid shadowing the builtin; os.pathsep is the
    # direct spelling of the PATH separator constant.
    for path_entry in search_path.split(os.pathsep):
        if os.path.exists(os.path.join(path_entry, "nvcc")):
            # Parent of the bin/ directory holding nvcc.
            return os.path.split(path_entry)[0]
    return ''
AddConfigVar(
'cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
    """Validate and normalize the ``nvcc.flags`` config value.

    Splits *s* on single spaces, drops empty fragments and checks that
    every remaining flag starts with ``-`` (i.e. that parameter/value
    pairs are joined with ``=`` rather than a space).

    Parameters
    ----------
    s : str
        Raw flag string from the Theano configuration.

    Returns
    -------
    str
        The flags re-joined with single spaces.

    Raises
    ------
    ValueError
        If any fragment does not start with ``-``.
    """
    assert isinstance(s, str)
    flags = [flag for flag in s.split(' ') if flag]
    # Generator form: no throwaway list, short-circuits on the first
    # malformed flag.
    if any(not flag.startswith("-") for flag in flags):
        raise ValueError(
            "Theano nvcc.flags support only parameter/value pairs without"
            " space between them. e.g.: '--machine 64' is not supported,"
            " but '--machine=64' is supported. Please add the '=' symbol."
            " nvcc.flags value is '%s'" % s)
    return ' '.join(flags)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
def default_dnn_path(suffix):
    """Build a lazy default for the cudnn path config variables.

    Returns a zero-argument callable that, when evaluated, joins the
    configured ``config.cuda.root`` with *suffix* (e.g. ``'include'``
    or ``'lib64'``), or yields the empty string when no CUDA root is
    configured.
    """
    def _default(suffix=suffix):
        root = config.cuda.root
        if root == '':
            return ''
        return os.path.join(root, suffix)
    return _default
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(default_dnn_path('include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(default_dnn_path('lib64')))
# This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph.
AddConfigVar(
......
......@@ -102,7 +102,7 @@ def change_flags(**kwargs):
l = [v for v in theano.configparser._config_var_list
if v.fullname == k]
assert len(l) == 1
old_val[k] = l[0].__get__()
old_val[k] = l[0].__get__(True, None)
try:
for k in kwargs:
l = [v for v in theano.configparser._config_var_list
......@@ -167,7 +167,7 @@ def _config_print(thing, buf):
for cv in _config_var_list:
print(cv, file=buf)
print(" Doc: ", cv.doc, file=buf)
print(" Value: ", cv.__get__(), file=buf)
print(" Value: ", cv.__get__(True, None), file=buf)
print("", file=buf)
......@@ -182,7 +182,7 @@ def get_config_md5():
all_opts = sorted([c for c in _config_var_list if c.in_c_key],
key=lambda cv: cv.fullname)
return theano.gof.utils.hash_from_code('\n'.join(
['%s = %s' % (cv.fullname, cv.__get__()) for cv in all_opts]))
['%s = %s' % (cv.fullname, cv.__get__(True, None)) for cv in all_opts]))
class TheanoConfigParser(object):
......@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
# Trigger a read of the value from config files and env vars
# This allow to filter wrong value from the user.
if not callable(configparam.default):
configparam.__get__()
configparam.__get__(root, type(root))
else:
# We do not want to evaluate now the default value
# when it is a callable.
try:
fetch_val_for_key(configparam.fullname)
# The user provided a value, filter it now.
configparam.__get__()
configparam.__get__(root, type(root))
except KeyError:
pass
setattr(root.__class__, sections[0], configparam)
......@@ -294,6 +294,7 @@ class ConfigParam(object):
self.default = default
self.filter = filter
self.allow_override = allow_override
self.is_default = True
# N.B. --
# self.fullname # set by AddConfigVar
# self.doc # set by AddConfigVar
......@@ -304,16 +305,19 @@ class ConfigParam(object):
# Calling `filter` here may actually be harmful if the default value is
# invalid and causes a crash or has unwanted side effects.
def __get__(self, *args):
def __get__(self, cls, type_):
if cls is None:
return self
if not hasattr(self, 'val'):
try:
val_str = fetch_val_for_key(self.fullname)
self.is_default = False
except KeyError:
if callable(self.default):
val_str = self.default()
else:
val_str = self.default
self.__set__(None, val_str)
self.__set__(cls, val_str)
# print "RVAL", self.val
return self.val
......
......@@ -1171,7 +1171,7 @@ def apply_meth(tag):
code = self.code_sections[tag]
define_macros, undef_macros = self.get_c_macros(node, name)
return os.linesep.join([define_macros, code,
return os.linesep.join(['', define_macros, code,
undef_macros])
else:
raise utils.MethodNotDefined(
......@@ -1428,7 +1428,7 @@ class COp(Op):
def_macros, undef_macros = self.get_c_macros(node, name)
def_sub, undef_sub = self.get_sub_macros(sub)
return os.linesep.join([def_macros, def_sub,
return os.linesep.join(['', def_macros, def_sub,
op_code,
undef_sub, undef_macros])
else:
......@@ -1442,17 +1442,21 @@ class COp(Op):
define_macros, undef_macros = self.get_c_macros(node, name,
check_input=False)
ctx = ""
if 'context' in sub:
ctx = ", %s" % (sub['context'],)
# Generate the C code
return """
%(define_macros)s
{
if (%(func_name)s(%(func_args)s) != 0) {
if (%(func_name)s(%(func_args)s%(ctx)s) != 0) {
%(fail)s
}
}
%(undef_macros)s
""" % dict(func_name=self.func_name,
fail=sub['fail'],
fail=sub['fail'], ctx=ctx,
func_args=self.format_c_function_args(inp, out),
define_macros=define_macros,
undef_macros=undef_macros)
......
......@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
# import dependency. So we also test it in the file theano/__init__.py
if config.device.startswith('gpu'):
use(device=config.device, force=config.force_device, test_driver=False)
elif config.init_gpu_device:
elif config.init_gpu_device.startswith('gpu'):
assert config.device == "cpu", (
"We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'")
......
......@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
import theano.sandbox.dnn_flags
def dnn_available():
if dnn_available.avail is None:
......@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
}
"""
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
comp, out, err = NVCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
params=params, preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
......
......@@ -8,6 +8,7 @@ import warnings
import numpy
from theano import config
from theano.compat import decode, decode_iter
from theano.gof import local_bitwidth
from theano.gof.utils import hash_from_file
......@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
from theano.configparser import (config, AddConfigVar, StrParam,
BoolParam, ConfigParam)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
# Tracks whether cuda.root came from the user rather than this default.
# NOTE(review): consumers of this flag (e.g. rpath handling) are outside
# this view — confirm they read it only after the config default ran.
user_provided_cuda_root = True


def default_cuda_root():
    # Default value for the cuda.root config variable: prefer the
    # CUDA_ROOT environment variable, else fall back to scanning PATH
    # via find_cuda_root() (defined elsewhere in this module).
    global user_provided_cuda_root
    v = os.getenv('CUDA_ROOT', "")
    # NOTE(review): the flag is cleared even when CUDA_ROOT *is* set,
    # so "user provided" here appears to mean "set via the Theano flag,
    # not via this default" — confirm against the flag's consumers.
    user_provided_cuda_root = False
    if v:
        return v
    return find_cuda_root()
AddConfigVar('cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
    """Check every nvcc flag is well-formed and return them re-joined.

    Each space-separated fragment must begin with ``-``; values must be
    attached with ``=`` (e.g. ``--machine=64``), never with a space.
    Raises ``ValueError`` on the first malformed fragment.
    """
    assert isinstance(s, str)
    cleaned = []
    for flag in s.split(' '):
        if not flag:
            continue
        if not flag.startswith("-"):
            raise ValueError(
                "Theano nvcc.flags support only parameter/value pairs without"
                " space between them. e.g.: '--machine 64' is not supported,"
                " but '--machine=64' is supported. Please add the '=' symbol."
                " nvcc.flags value is '%s'" % s)
        cleaned.append(flag)
    return ' '.join(cleaned)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
nvcc_path = 'nvcc'
nvcc_version = None
......@@ -115,14 +55,6 @@ def is_nvcc_available():
return False
def find_cuda_root():
    """Search ``PATH`` for an ``nvcc`` executable and return its parent.

    ``nvcc`` conventionally lives in ``<cuda_root>/bin``, so the parent
    of the directory containing it is taken as the CUDA installation
    root.

    Returns
    -------
    str or None
        The CUDA root directory, or ``None`` when ``PATH`` is unset or
        empty, or no ``nvcc`` is found.  ``None`` (not ``''``) is kept
        for backward compatibility with existing callers.
    """
    search_path = os.getenv("PATH")
    if not search_path:
        return None
    # `dir` renamed to avoid shadowing the builtin; the final return
    # makes the previously implicit None explicit.
    for entry in search_path.split(os.path.pathsep):
        if os.path.exists(os.path.join(entry, "nvcc")):
            return os.path.split(entry)[0]
    return None
rpath_defaults = []
......@@ -229,7 +161,7 @@ class NVCC_compiler(Compiler):
include_dirs
A list of include directory names (each gets prefixed with -I).
lib_dirs
A list of library search path directory names (each gets
A list of library search path directory names (each gets
prefixed with -L).
libs
A list of libraries to link with (each gets prefixed with -l).
......@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
# provided an cuda.root flag, we need to add one, but
# otherwise, we don't add it. See gh-1540 and
# https://wiki.debian.org/RpathIssue for details.
if (user_provided_cuda_root and
if (not type(config.cuda).root.is_default and
os.path.exists(os.path.join(config.cuda.root, 'lib'))):
rpaths.append(os.path.join(config.cuda.root, 'lib'))
......
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import os.path
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano import config
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
......@@ -19,13 +19,6 @@ try:
except ImportError:
pygpu = None
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor)
......@@ -57,8 +50,9 @@ if pygpu:
import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
elif config.gpuarray.init_device != '':
init_dev(config.gpuarray.init_device)
elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
init_dev(config.init_gpu_device)
from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
......@@ -70,7 +64,8 @@ if pygpu:
except Exception:
error("Could not initialize pygpu, support disabled", exc_info=True)
else:
if (config.gpuarray.init_device != '' or
config.device.startswith('opencl') or
config.device.startswith('cuda')):
if (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl') or
config.device.startswith('opencl') or
config.device.startswith('cuda')):
error("pygpu was configured but could not be imported", exc_info=True)
......@@ -2,11 +2,9 @@ import os
import numpy
import theano
from theano import Op, Apply
from theano import tensor, scalar, config
from theano import Op, Apply, Type, Variable
from theano import tensor, config
from theano.gradient import grad_undefined
from theano.scalar import Scalar
from theano.tensor.basic import Alloc, Join, Split
from theano.gof import HideC
......@@ -17,7 +15,7 @@ from six.moves import xrange
try:
import pygpu
from pygpu import gpuarray, elemwise
from pygpu import gpuarray
except ImportError:
pass
......@@ -293,7 +291,6 @@ class GpuFromHost(Op):
def perform(self, node, inp, out):
x, = inp
z, = out
type = node.outputs[0].type
z[0] = gpuarray.array(x)
def grad(self, inputs, grads):
......@@ -312,254 +309,29 @@ class GpuFromHost(Op):
def c_code(self, node, name, inputs, outputs, sub):
return """
PyGpuArrayObject *%(name)s_tmp;
%(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
if (%(name)s_tmp == NULL)
%(fail)s
Py_XDECREF(%(out)s);
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(inp)s),
get_typecode((PyObject *)PyArray_DESCR(%(inp)s)),
PyArray_NDIM(%(inp)s),
(size_t *)PyArray_DIMS(%(inp)s),
(ssize_t *)PyArray_STRIDES(%(inp)s),
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
PyArray_NDIM(%(name)s_tmp),
(size_t *)PyArray_DIMS(%(name)s_tmp),
(ssize_t *)PyArray_STRIDES(%(name)s_tmp),
pygpu_default_context(),
Py_None);
if (%(out)s == NULL) {
%(fail)s
}
Py_DECREF(%(name)s_tmp);
if (%(out)s == NULL)
%(fail)s
""" % {'name': name, 'inp': inputs[0],
'out': outputs[0], 'fail': sub['fail']}
def c_code_cache_version(self):
return (4,)
gpu_from_host = GpuFromHost()
class GpuFromCuda(Op):
view_map = {0: [0]}
__props__ = ()
def make_node(self, x):
from theano.sandbox.cuda import CudaNdarrayType
if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x)
return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
dtype=x.dtype)()])
def perform(self, node, inp, out):
x, = inp
z, = out
z[0] = gpuarray.array(numpy.asarray(x))
def grad(self, inputs, grads):
gz, = grads
return [cuda_from_gpu(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
if isinstance(ev, GpuArrayType):
return [cuda_from_gpu(ev)]
else:
return ev
def infer_shape(self, node, xshp):
return xshp
def c_headers(self):
return ['<cuda_ndarray.cuh>', '<gpuarray/extension.h>',
'<gpuarray/types.h>', '<cuda.h>']
def c_header_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'include'))
return ret
def c_lib_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'lib'))
return ret
def c_libraries(self):
return ['cudart', 'cublas', 'cuda']
def c_support_code(self):
return """
CUcontext (*cuda_get_ctx)(void *ctx);
gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
"""
def c_init_code(self):
return ['cuda_get_ctx = (CUcontext (*)(void *))gpuarray_get_extension("cuda_get_ctx");',
'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf");']
def c_code(self, node, name, inputs, outputs, sub):
return """
int %(name)serr;
gpudata *%(name)sdata;
CUcontext %(name)scur;
size_t *%(name)sdims;
ssize_t *%(name)sstr;
cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s
}
%(name)sdims = (size_t *)calloc(%(in)s->nd, sizeof(size_t));
if (%(name)sdims == NULL) {
PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
%(fail)s
}
%(name)sstr = (ssize_t *)calloc(%(in)s->nd, sizeof(ssize_t));
if (%(name)sstr == NULL) {
free(%(name)sdims);
PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
%(fail)s
}
for (unsigned int i = 0; i < %(in)s->nd; i++) {
%(name)sdims[i] = (size_t)CudaNdarray_HOST_DIMS(%(in)s)[i];
%(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4;
}
%(name)sdata = cuda_make_buf(pygpu_default_context()->ctx,
(CUdeviceptr)%(in)s->devdata,
((size_t)%(in)s->data_allocated)*4);
if (%(name)sdata == NULL) {
Py_DECREF(%(out)s);
free(%(name)sdims);
free(%(name)sstr);
PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = pygpu_fromgpudata(%(name)sdata, 0, GA_FLOAT, %(in)s->nd,
%(name)sdims, %(name)sstr,
pygpu_default_context(), 1,
(PyObject *)%(in)s,
(PyObject *)&PyGpuArrayType);
pygpu_default_context()->ops->buffer_release(%(name)sdata);
free(%(name)sdims);
free(%(name)sstr);
if (%(out)s == NULL) {
%(fail)s
}
""" % {'name': name, 'in': inputs[0], 'out': outputs[0],
'fail': sub['fail']}
def c_code_cache_version(self):
return (5,)
gpu_from_cuda = GpuFromCuda()
class CudaFromGpu(Op):
view_map = {0: [0]}
__props__ = ()
def make_node(self, x):
from theano.sandbox.cuda import CudaNdarrayType
if not isinstance(x.type, GpuArrayType):
raise TypeError(x)
if x.type.dtype != 'float32':
raise TypeError(x)
return Apply(self, [x], [CudaNdarrayType(broadcastable=x.broadcastable)()])
def perform(self, node, inp, out):
from theano.sandbox.cuda import filter as cuda_filter
x, = inp
z, = out
z[0] = cuda_filter(theano._asarray(x, dtype='float32'),
tuple([0] * x.ndim), 0, z[0])
def grad(self, inputs, grads):
gz, = grads
return [gpu_from_cuda(gz)]
def R_op(self, inputs, eval_points):
from theano.sandbox.cuda import CudaNdarrayType
ev, = eval_points
if (isinstance(ev, CudaNdarrayType)):
return [gpu_from_cuda(ev)]
else:
return [ev]
def infer_shape(self, node, shp):
return shp
def c_headers(self):
return ['<cuda_ndarray.cuh>', '<gpuarray/extension.h>', '<cuda.h>']
def c_header_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'include'))
return ret
def c_lib_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'lib'))
return ret
def c_libraries(self):
return ['cudart', 'cublas', 'cuda']
def c_support_code(self):
return """
CUcontext (*cuda_get_ctx)(void *ctx);
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
"""
def c_init_code(self):
return ['cuda_get_ctx = (CUcontext (*)(void *ctx))gpuarray_get_extension("cuda_get_ctx");',
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
def c_code(self, node, name, inputs, outputs, sub):
return """
int %(name)serr = 0, %(name)si;
CUcontext %(name)scur;
cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s
}
if (GpuArray_sync(&%(inp)s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not sync GpuArray");
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd);
if (!%(out)s) {
%(fail)s
}
for (%(name)si = 0; %(name)si < %(inp)s->ga.nd; %(name)si++) {
CudaNdarray_set_dim(%(out)s, %(name)si, %(inp)s->ga.dimensions[%(name)si]);
CudaNdarray_set_stride(%(out)s, %(name)si, %(inp)s->ga.strides[%(name)si]/4);
}
%(name)serr = CudaNdarray_set_device_data(%(out)s,
(float *)(((char *)cuda_get_ptr(%(inp)s->ga.data))+%(inp)s->ga.offset),
(PyObject *)%(inp)s);
if (%(name)serr) {
%(fail)s
}
""" % {'name': name, 'inp': inputs[0], 'out': outputs[0],
'fail': sub['fail']}
def c_code_cache_version(self):
return (3,)
cuda_from_gpu = CudaFromGpu()
gpu_from_host = GpuFromHost()
class GpuAlloc(HideC, Alloc):
......@@ -572,7 +344,7 @@ class GpuAlloc(HideC, Alloc):
value is always 0, so the c code call memset as it is faster.
"""
__props__ = ('memset_0',)
_f16_ok = True
......@@ -592,7 +364,7 @@ class GpuAlloc(HideC, Alloc):
sh, bcast = self.validate_shape(shape)
if value.ndim > len(sh):
TypeError("The GpuAlloc value to use has more dimensions "
"than the specified shape", v.ndim, len(sh))
"than the specified shape", value.ndim, len(sh))
otype = value.type.clone(broadcastable=bcast)
return Apply(self, [value] + sh, [otype()])
......@@ -686,29 +458,28 @@ class GpuAlloc(HideC, Alloc):
return (2,)
def do_constant_folding(self, node):
from . import subtensor, blas
for client in node.outputs[0].clients:
if client[0] == 'output':
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif ( # The following ops work inplace of their input id 0.
client[1] == 0 and
isinstance(client[0].op, (
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# constant and this is less efficients.
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# always exists.
# theano.tensor.subtensor.AdvancedIncSubtensor,
theano.sandbox.gpuarray.subtensor.GpuIncSubtensor,
theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1,
theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1_dev20,
theano.sandbox.gpuarray.blas.GpuGemm,
theano.sandbox.gpuarray.blas.GpuGemv,
theano.sandbox.gpuarray.blas.GpuGer,
))):
# The following ops work inplace of their input id 0.
elif (client[1] == 0 and
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# constant and this is less efficients.
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# always exists.
isinstance(client[0].op,
(subtensor.GpuIncSubtensor,
subtensor.GpuAdvancedIncSubtensor1,
subtensor.GpuAdvancedIncSubtensor1_dev20,
blas.GpuGemm, blas.GpuGemv,
blas.GpuGer)
)):
return False
# If the clients is a transfer, we don't want to fold. We
# let the moving opt finish before deciding what to do.
......@@ -795,7 +566,7 @@ class GpuContiguous(Op):
"""
Always return a c contiguous output. Copy the input only if it is
not already c contiguous.
"""
__props__ = ()
......@@ -980,7 +751,7 @@ class GpuJoin(HideC, Join):
node = Join.make_node(self, axis, *tensors)
return Apply(self, [node.inputs[0]] + list(map(as_gpuarray_variable,
tensors)),
tensors)),
[GpuArrayType(broadcastable=node.outputs[0].broadcastable,
dtype=node.outputs[0].dtype)()])
......@@ -1089,8 +860,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
flags=Kernel.get_flags(self.dtype),
objvar='k_eye_'+name,
)]
objvar='k_eye_' + name)]
def c_code(self, node, name, inp, out, sub):
n, m = inp
......
......@@ -5,17 +5,15 @@ import theano
from theano import config, gof
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from six.moves import reduce
from .comp import NVCC_compiler
from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from theano.gof import utils
class GpuConv(GpuKernelBase, gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
......@@ -70,19 +68,19 @@ class GpuConv(GpuKernelBase, gof.Op):
raise ValueError(mode)
def __init__(self, border_mode,
subsample=(1, 1),
logical_img_hw=None,
logical_kern_hw=None,
logical_kern_align_top=True,
version=-1,
direction_hint=None,
verbose=0,
kshp=None,
imshp=None,
max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True):
subsample=(1, 1),
logical_img_hw=None,
logical_kern_hw=None,
logical_kern_align_top=True,
version=-1,
direction_hint=None,
verbose=0,
kshp=None,
imshp=None,
max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True):
self.border_mode = border_mode
self.subsample = subsample
if logical_img_hw is not None:
......@@ -182,7 +180,7 @@ class GpuConv(GpuKernelBase, gof.Op):
def flops(self, inputs, outputs):
"""
Useful with the hack in profilemode to print the MFlops.
"""
images, kerns = inputs
out, = outputs
......@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
nb = 0
if self.kshp is not None:
nb = self.kshp[1]
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
return ['-DTHEANO_KERN_WID=' + str(nb)]
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
return ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 21)
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
return (0, 22)
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
......
......@@ -26,10 +26,7 @@ from .conv import GpuConv
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge
# We need to import this to define the flags.
from theano.sandbox import dnn_flags # noqa
from .opt_util import alpha_merge, output_merge, inplace_allocempty
def dnn_available():
......@@ -50,7 +47,6 @@ def dnn_available():
dnn_available.avail = False
preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
......@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
}
"""
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
comp, out, err = GCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
params, preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
......@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
@local_optimizer([GpuDnnConv], inplace=True)
def local_dnn_conv_inplace(node):
if type(node.op) != GpuDnnConv or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
if type(node.op) != GpuDnnConvGradW or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
if type(node.op) != GpuDnnConvGradI or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
local_dnn_convgi_inplace,
name="local_dnn_conv_inplace"),
name="local_dnna_conv_inplace"),
70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
......
from __future__ import print_function
import copy
import os
from theano.compat import izip
import numpy
import theano
from theano import Apply, scalar, config
from theano import scalar as scal
from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined
from theano.gof.cmodule import GCC_compiler
from theano.scalar import Scalar
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
......@@ -94,7 +91,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
try:
support_code = self.scalar_op.c_support_code()
if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
support_code.strip() != ""):
support_code.strip() != ""):
# The macro is fine, the C++ struct is not.
raise SupportCodeError(support_code)
except MethodNotDefined:
......@@ -108,7 +105,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
outs = [make_argument(o, 'o%d' % (n,)) for n, o in
enumerate(node.outputs) if not n in self.inplace_pattern]
enumerate(node.outputs) if n not in self.inplace_pattern]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
......@@ -132,7 +129,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else:
scal_out.append(arg.name + '[i]')
kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
scal_in, scal_out,
dict(fail='return;'))
......@@ -169,27 +166,12 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
("npy_float16", "ga_half"),
("npy_float32", "ga_float"),
("npy_float64", "ga_double"),
]:
]:
kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_compiler(self):
return GCC_compiler
return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_support_code(self):
return self.scalar_op.c_support_code()
......@@ -231,11 +213,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
node.outputs[0].type.dtype),
objvar='elem_%d_%s' % (nd, nodename))]
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
......@@ -399,7 +376,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
......@@ -417,7 +394,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % dict(kname=kname,fail=fail)
""" % dict(kname=kname, fail=fail)
if config.gpuarray.sync:
code += """
err = GpuArray_sync(&%(z)s->ga);
......@@ -460,7 +437,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version()
if ver:
return (3, ver)
return (4, ver)
else:
return ver
......@@ -495,7 +472,7 @@ class GpuDimShuffle(HideC, DimShuffle):
res = input
res = res.transpose(self.shuffle+self.drop)
res = res.transpose(self.shuffle + self.drop)
shape = list(res.shape[:len(self.shuffle)])
for augm in self.augment:
......@@ -533,7 +510,7 @@ class GpuDimShuffle(HideC, DimShuffle):
Py_DECREF(tmp);
return res;
}
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle+self.drop)),
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle + self.drop)),
name=name, nd_out=len(self.new_order),
copy_shape=copy_shape(len(self.new_order)))
......@@ -565,7 +542,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return process
def c_code_cache_version(self):
return (4,)
return (5,)
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
......@@ -581,7 +558,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
pre_scalar_op
If present, must be a scalar op with only 1 input. We will execute it
on the input value before reduction.
Examples
--------
When scalar_op is a theano.scalar.basic.Add instance:
......@@ -671,8 +648,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if self.pre_scalar_op:
# Currently we only tested pre_scalar_op that don't cause
# upcast.
d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
assert d1.dtype == ret.outputs[0].dtype
assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
if self.reduce_mask is None:
if self.axis is None:
......@@ -687,8 +662,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if (x.type.ndim != len(self.reduce_mask)):
raise TypeError("x must have rank %i" % len(self.reduce_mask))
if ("complex" in x.dtype or
"complex" in ret.outputs[0].dtype or
"complex" in self._acc_dtype(x.dtype)):
"complex" in ret.outputs[0].dtype or
"complex" in self._acc_dtype(x.dtype)):
raise NotImplementedError("We don't support complex in gpu reduction")
return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
ret.outputs[0].type.broadcastable)()])
......@@ -732,17 +707,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return False
return True
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, name, inp, out, sub):
x, = inp
......@@ -760,6 +726,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sio = StringIO()
fail = sub['fail']
ctx = sub['context']
# check input
print("""
......@@ -824,8 +791,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(%(nd_out)s, new_dims,
%(out_typecode)s, GA_C_ORDER,
pygpu_default_context(),
Py_None);
pygpu_default_context(), Py_None);
if (NULL == %(z)s)
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
......@@ -863,14 +829,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims.
print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){'%locals(), file=sio)
print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){' % locals(),
file=sio)
self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print("}else{", file=sio)
getattr(self, 'c_code_reduce_%s'%(''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
getattr(self, 'c_code_reduce_%s' %
(''.join(str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
print("}", file=sio)
else:
getattr(self, 'c_code_reduce_%s'%(''.join(
getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
# \end bracket the reduction ...
......@@ -1094,8 +1062,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
else:
assert isinstance(self.scalar_op, (scal.Maximum,
scal.Minimum))
if self.pre_scalar_op: # TODO, multi_dtype!
#dtype = node.inputs[0].dtype
if self.pre_scalar_op: # TODO: multiple dtypes
# dtype = node.inputs[0].dtype
dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)()
......@@ -1171,7 +1139,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Parameters
----------
node, name, sub
node, name, sub
These should be passed through from the original call to c_code.
"""
......@@ -1411,7 +1379,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
"""
Parameters
----------
N
......@@ -1943,12 +1911,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [16] # the version corresponding to the c code in this Op
version = [17] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version())
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
......@@ -1962,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
......@@ -2106,10 +2071,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
#01, 011, 0111
# 01, 011, 0111
if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]):
all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]):
# this kernel uses one block for each row.
# threads per block for each element per row.
......@@ -2303,10 +2268,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
init = self._k_init(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
......@@ -2515,7 +2480,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row,
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={})
......@@ -2625,7 +2590,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename
k_var = "kernel_reduce_1011_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(
......@@ -2712,7 +2677,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
# cache the kernel object
self.get_kernel_cache(node)
return super(GpuCAReduceCPY, self).make_thunk(node, storage_map,
compute_map, no_recycling)
compute_map, no_recycling)
def get_kernel_cache(self, node):
attr = '@cache_reduction_k'
......@@ -2753,7 +2718,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objvar='k_reduk_'+name)]
objvar='k_reduk_' + name)]
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
......@@ -2768,8 +2733,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d)
GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
......@@ -2816,8 +2781,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else:
code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) {
......@@ -2828,8 +2793,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype:
code += """
......@@ -2837,12 +2802,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None);
if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
""" % dict(output=output, fail=sub['fail'],
acc_type=dtype_to_typecode(acc_dtype))
else:
code += """
tmp = %(output)s;
Py_INCREF(tmp);
""" % dict(output=output)
""" % dict(output=output)
# We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the
......@@ -2850,7 +2816,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
code += """
args[0] = &n;
args[1] = tmp->ga.data;
""" % dict(output=output)
""" % dict(output=output)
p = 2
for i in range(node.inputs[0].ndim):
......@@ -2858,7 +2824,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
args[%(p)s] = &proxy_dim[%(i)s];
n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
""" % dict(i=i, p=p, input=input)
p += 1
if not redux[i]:
code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
......@@ -2867,14 +2833,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
args[%(p)s] = %(input)s->ga.data;
proxy_off = %(input)s->ga.offset;
args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
""" % dict(p=p, input=input)
p += 2
for i in range(node.inputs[0].ndim):
code += """
proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
""" % dict(p=p, i=i, input=input)
p += 1
code += """
......@@ -2911,14 +2877,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
""" % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
return (1, self.GpuKernelBase_version)
return (2, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
......@@ -2942,8 +2908,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
redux = self.redux
if any(redux):
output[0] = self.get_kernel_cache(node)(input).astype(copy=False,
dtype=node.outputs[0].type.dtype)
output[0] = self.get_kernel_cache(node)(input).astype(
copy=False, dtype=node.outputs[0].type.dtype)
else:
output[0] = pygpu.gpuarray.array(input, copy=True,
dtype=node.outputs[0].type.dtype)
......
......@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
"""
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
......@@ -174,16 +174,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
"""
ctype = gpuarray.dtype_to_ctype(dtype)
return [
# get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
# get max of buf (trashing all but buf[0])
return [inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'}',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
......@@ -192,8 +191,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'}',
'__syncthreads()',
]
......@@ -232,7 +231,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and returns c code
for their reduction.
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
manner_init
A function that accepts strings of arguments a and return c code for its
......@@ -259,7 +258,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
......@@ -324,7 +323,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
Parameters
----------
N
N
Length of the buffer, atleast waprSize(32).
buf
A shared memory buffer of size warpSize * sizeof(dtype).
......
import os
import numpy
from theano import Op, Apply, config
from theano.gof import local_optimizer
from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T
try:
import pygpu
from pygpu import gpuarray, elemwise
from pygpu import gpuarray
except ImportError:
pass
from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
from .comp import NVCC_compiler
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
......@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (10,1)
return (11,)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype
......
......@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)]
@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)]
......
from __future__ import print_function
import numpy
import os
from theano import Op, Apply, config
from six import StringIO
try:
import pygpu
from pygpu import gpuarray, elemwise
from pygpu import gpuarray
except ImportError:
pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
inline_softmax,
inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w
......@@ -41,16 +40,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
......@@ -302,7 +293,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return sio.getvalue()
def c_code_cache_version(self):
return (7,)
return (8,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......@@ -328,18 +319,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
return (10,)
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
return (11,)
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
......@@ -541,21 +524,10 @@ class GpuSoftmax(GpuKernelBase, Op):
return shape
def c_code_cache_version(self):
return (14,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
return (15,) + inline_softmax.code_version
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype
......@@ -665,60 +637,60 @@ class GpuSoftmax(GpuKernelBase, Op):
]
kernels = []
kname = "kSoftmax"
k_var= "kSoftmax_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
k_var = "kSoftmax_" + nodename
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x',
'blockDim.x', dtype=work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
body=["extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x',
'blockDim.x', dtype=work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared"
k_var= "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
k_var = "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
dtype=work_sm),
"__syncthreads()",
"}",
])
body=["extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
dtype=work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
......@@ -743,26 +715,13 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return Apply(self, [x, b], [x.type()])
def infer_shape(self, node, shape):
return [shape[0]]
return [shape[0]]
def c_code_cache_version(self):
return (13,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
return (14,) + inline_softmax.code_version
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype
......@@ -892,65 +851,65 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=["extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=["extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
......
......@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
@register_opt()
# NOTE(review): the two @alpha_merge lines below are the old (with
# `nd=2`) and new versions of the same decorator shown side by side by
# the diff rendering; a real source file has only one of them.
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2)
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs):
    # Rebuild the GEMM with the scalar multiplication folded into alpha;
    # `inputs` already carry the merged alpha/beta (provided by the
    # alpha_merge decorator).
    return [gpugemm_no_inplace(*inputs)]
@register_opt()
# NOTE(review): the two @output_merge lines below are the old (with
# `nd=2`) and new versions of the same decorator shown side by side by
# the diff rendering; a real source file has only one of them.
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2)
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs):
    # Rebuild the GEMM with the output addition folded into beta/out;
    # `inputs` already carry the merged values (provided by the
    # output_merge decorator).
    return [gpugemm_no_inplace(*inputs)]
......
......@@ -7,23 +7,35 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float64'))
def grab_cpu_scalar(v, nd):
"""
Get a scalar variable value from the tree at `v`.
This function will dig through transfers and dimshuffles to get
the constant value. If no such constant is found, it returns None.
Parameters
----------
v : variable
Theano variable to extract the constant value from.
nd : int
Expected number of dimensions for the variable (for
broadcasted constants).
"""
if v.owner is not None:
n = v.owner
if (isinstance(n.op, GpuDimShuffle) and
if (isinstance(n.op, (GpuDimShuffle, DimShuffle)) and
n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0])
elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0])
elif isinstance(n.op, GpuFromHost):
return grab_cpu_scalar(n.inputs[0], nd=nd)
return grab_cpu_scalar(n.inputs[0], n.inputs[0].ndim)
elif isinstance(n.op, (GpuFromHost, HostFromGpu)):
return grab_cpu_scalar(n.inputs[0], nd)
else:
return None
else:
......@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd):
def find_node(v, cls, ignore_clients=False):
# This digs through possibly redundant transfers to for the node
# that has the op class specified. If ignore_clients is False (the
# default) it will only dig through nodes that have a single
# client.
"""
Find the node that has an op of of type `cls` in `v`.
This digs through possibly redundant transfers to for the node
that has the type `cls`. If `ignore_clients` is False (the
default) it will only dig through nodes that have a single client
to avoid duplicating computations.
Parameters
----------
v : variable
The variable to dig through
cls : Op class
The type of the node we are looking for
ignore_clients : bool, optional
Whether to ignore multiple clients or not.
"""
if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls):
return v.owner
......@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False):
def is_equal(var, val):
# Returns True if var is always equal to val (python value), False
# otherwise (including if var is not constant)
"""
Returns True if `var` is always equal to `val`.
This will only return True if the variable will always be equal to
the value. If it might not be true in some cases then it returns False.
Parameters
----------
var : variable
Variable to compare
val : value
Python value
"""
try:
v = get_scalar_constant_value(var)
return v == val
......@@ -59,7 +97,57 @@ def is_equal(var, val):
return False
def alpha_merge(cls, alpha_in, beta_in, nd):
def alpha_merge(cls, alpha_in, beta_in):
"""
Decorator to merge multiplication by a scalar on the output.
This will find a pattern of scal * <yourop>(some, params, alpha,
beta) and update it so that the scalar multiplication happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you recieve is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameters contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
"""
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
......@@ -70,11 +158,14 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
targ = find_node(node.inputs[0], cls)
if targ is None:
targ = find_node(node.inputs[1], cls)
lr = grab_cpu_scalar(node.inputs[0], nd=nd)
if targ is None:
return
lr = grab_cpu_scalar(node.inputs[0],
nd=targ.outputs[0].ndim)
else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd)
if (lr is None or targ is None or
lr.dtype != targ.outputs[0].dtype):
lr = grab_cpu_scalar(node.inputs[1],
nd=targ.outputs[0].ndim)
if lr is None or lr.dtype != targ.outputs[0].dtype:
return None
inputs = list(targ.inputs)
try:
......@@ -96,7 +187,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
return wrapper
def output_merge(cls, alpha_in, beta_in, out_in, nd):
def output_merge(cls, alpha_in, beta_in, out_in):
"""
Decorator to merge addition by a value on the output.
This will find a pattern of val * <yourop>(some, params, alpha,
beta, out_like) and update it so that the addtition happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you recieve is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameters contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
out_in : int
The input index for the out_like input for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
This also correctly handles the case where the added value is
broadcasted (by not performing the replacement).
"""
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
......@@ -126,3 +272,56 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
return maker(targ, *inputs)
return opt
return wrapper
def inplace_allocempty(op, idx):
    """
    Build an inplace local optimizer that copes with AllocEmpty inputs.

    If the input at position `idx` is produced by a GpuAllocEmpty that
    has more than one client, that allocation is duplicated first so the
    op can safely overwrite its own copy inplace.

    The decorated function must have this signature:

        maker(node, inputs)

    where `node` is the original apply node (read any properties you
    need from it, and switch the op to inplace mode) and `inputs` are
    the inputs the rebuilt op MUST use.  A minimal maker is:

        def maker(node, inputs):
            return node.op.__class__(inplace=True)(*inputs)

    Parameters
    ----------
    op : op class
        The op class to look for to make inplace
    idx : int
        The index of the (possibly) AllocEmpty input (in node.inputs).

    Returns
    -------
    An unregistered inplace local optimizer with the same name as the
    decorated function.
    """
    def wrapper(maker):
        @local_optimizer([op], inplace=True)
        @wraps(maker)
        def opt(node):
            # Only rewrite exact instances of `op` that are not already
            # inplace.
            if type(node.op) != op or node.op.inplace:
                return
            new_inputs = list(node.inputs)
            target = new_inputs[idx]
            owner = target.owner
            if (owner and isinstance(owner.op, GpuAllocEmpty) and
                    len(target.clients) > 1):
                # The empty buffer is shared with other clients: give
                # this node its own allocation so overwriting it inplace
                # is safe.
                dup = GpuAllocEmpty(owner.op.dtype)
                new_inputs[idx] = dup(*owner.inputs)
            return maker(node, new_inputs)
        return opt
    return wrapper
......@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
def _f16_ok(self):
    # float16 support is decided by the wrapped inplace-add node's op:
    # this op simply forwards its capability flag.
    # NOTE(review): likely decorated with @property in the full file —
    # the decorator line is outside this diff hunk; confirm upstream.
    return self.iadd_node.op._f16_ok
def c_header_dirs(self):
    """Return the CUDA include directory if a CUDA root is configured.

    Gives [<cuda_root>/include] when config.cuda.root is set, otherwise
    an empty list.
    """
    root = config.cuda.root
    return [os.path.join(root, 'include')] if root else []
def c_headers(self):
    """Forward the C header list of the wrapped inplace-add node's op."""
    inner_op = self.iadd_node.op
    return inner_op.c_headers()
def c_compiler(self):
    """Use the same C compiler as the wrapped inplace-add node's op."""
    inner_op = self.iadd_node.op
    return inner_op.c_compiler()
def c_init_code(self):
    """Forward the C initialization code of the wrapped inplace-add op."""
    inner_op = self.iadd_node.op
    return inner_op.c_init_code()
......@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version:
return
return parent_version + elemwise_version + (2,)
return parent_version + elemwise_version + (3,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
......
import unittest
from theano.compat import izip
from copy import copy, deepcopy
from six import iteritems
......@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
# Don't import test classes otherwise they get tested as part of the file
from theano.tensor.tests import test_basic
from theano.tensor.tests.test_basic import rand, safe_make_node
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
import theano.sandbox.gpuarray
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
import theano.sandbox.cuda as cuda_ndarray
try:
from pygpu import gpuarray
except:
pass
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
# We should not enable all the use like the flag device=gpu,
......@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc,
GpuAllocEmpty,
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuContiguous,
GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt
utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed())
from pygpu import gpuarray
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
......@@ -63,22 +61,6 @@ else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def may_fail(msg, EClass):
    """Decorate a test that needs very specific conditions to run.

    If the wrapped test raises an exception of type `EClass`, it is
    converted into a SkipTest carrying `msg` and the original error;
    any other exception propagates unchanged.  The wrapper keeps the
    original test's __name__ so test discovery still reports it.
    """
    def deco(f):
        def run():
            try:
                f()
            except Exception as err:
                if not isinstance(err, EClass):
                    raise
                raise SkipTest(msg, err)
        run.__name__ = f.__name__
        return run
    return deco
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None):
if mode is None:
......@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
else:
err_msg = ("Test %s::%s: exception raised during test "
"call was not the same as the reference "
"call (got: %s, expected %s)") % \
"call (got: %s, expected %s)" %
(self.gpu_op, testname, type(exc),
type(ref_e))
type(ref_e)))
exc.args += (err_msg,)
raise
......@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
expected):
self.fail(("Test %s::%s: Output %s gave the wrong "
"value. With inputs %s, expected %s "
"(dtype %s), got %s (dtype %s).") % (
self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype))
"(dtype %s), got %s (dtype %s)." %
(self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype)))
for description, check in iteritems(self.checks):
if not check(inputs, variables):
......@@ -250,36 +232,6 @@ def test_transfer_strided():
assert numpy.all(fv == av)
@may_fail("Op fails if both contexts are not the same and it's rare "
"that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
raise SkipTest("Can't test interaction with cuda if cuda not present")
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
c = cuda_ndarray.CudaNdarrayType((False, False))('c')
av = theano._asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
cv = cuda_ndarray.CudaNdarray(av)
gvs = gv[:, ::-2]
cvs = cv[:, ::-2]
f = theano.function([c], gpu_from_cuda(c))
fv = f(cv)
assert GpuArrayType.values_eq_approx(fv, gv)
fvs = f(cvs)
assert GpuArrayType.values_eq_approx(fvs, gvs)
f = theano.function([g], cuda_from_gpu(g))
fv = f(gv)
assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)
fvs = f(gvs)
assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
def gpu_alloc_expected(x, *shp):
g = gpuarray.empty(shp, dtype=x.dtype)
g[:] = x
......@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
gpu_op=gpu_alloc,
cases=dict(
correct01=(rand(), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
# correct01_bcast=(rand(1), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
# correct01_bcast=(rand(1), numpy.int32(7)),
correct02=(rand(), numpy.int32(4), numpy.int32(7)),
correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
correct13=(rand(7), numpy.int32(2), numpy.int32(4),
......@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
cv = gpuarray.asarray(numpy.random.rand(5, 4),
dtype='float32')
gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu
f = theano.function([a], gpu_from_host(a), mode=m)
assert gpu_from_host in [x.op
for x in f.maker.fgraph.toposort()]
......
......@@ -6,8 +6,7 @@ import numpy
import theano
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22)
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor
......@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
mode_with_gpu)
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpugemm_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuGemm)
GpuGemvTester = makeTester('GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict(
dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
)
)
GpuGemvTester = makeTester(
'GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
)
)
class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
......@@ -48,24 +47,24 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
return theano.shared(val)
GpuGemmTester = makeTester('GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict(
test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
GpuGemmTester = makeTester(
'GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
class TestGpuSger(TestGer):
......@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
def test_f32_0_0(self):
raise SkipTest('0-sized objects not supported')
def test_f32_1_0(self):
raise SkipTest('0-sized objects not supported')
def test_f32_0_1(self):
raise SkipTest('0-sized objects not supported')
......@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
GpuDot22Tester = makeTester(
'GpuGemmTester',
'GpuDot22Tester',
op=_dot22, gpu_op=gpu_dot22,
cases=dict(
test1=[rand(3, 4), rand(4, 5)],
test2=[rand(1, 4), rand(4, 5)],
test3=[rand(3, 1), rand(1, 5)],
test4=[rand(3, 4), rand(4, 1)],
# test5=[rand(0, 4), rand(4, 5)],
# test6=[rand(3, 0), rand(0, 5)],
# test7=[rand(3, 4), rand(4, 0)],
# test8=[rand(0, 4), rand(4, 0)],
# test9=[rand(0, 0), rand(0, 0)],
# test5=[rand(0, 4), rand(4, 5)],
# test6=[rand(3, 0), rand(0, 5)],
# test7=[rand(3, 4), rand(4, 0)],
# test8=[rand(0, 4), rand(4, 0)],
# test9=[rand(0, 0), rand(0, 0)],
)
)
def test_hgemm_swap():
from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5':
......@@ -149,6 +151,7 @@ def test_hgemm_swap():
utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge():
from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5':
......
......@@ -6,37 +6,36 @@ import sys
import time
import unittest
import numpy
from six.moves import xrange
from nose.plugins.skip import SkipTest
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
import theano
from theano import tensor
from theano.tests.unittest_tools import seed_rng
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from .test_basic_ops import mode_with_gpu
from ..type import GpuArrayType
from ..conv import GpuConv
from theano.sandbox.gpuarray import dnn
import pygpu
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
gftensor4 = GpuArrayType('float32', [False] * 4)
def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1]
outshp = (img.shape[0], kern.shape[0],
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
out = numpy.zeros(outshp, dtype='float32')
for b in xrange(out.shape[0]):
for k in xrange(out.shape[1]):
......@@ -60,7 +59,7 @@ def py_conv_full_numpy(img, kern):
padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols),
dtype=img.dtype)
padded_img[:, :, kern.shape[2] - 1: kern.shape[2] - 1 + img.shape[2],
kern.shape[3] - 1: kern.shape[3] - 1 + img.shape[3]] = img
kern.shape[3] - 1: kern.shape[3] - 1 + img.shape[3]] = img
return py_conv_valid_numpy(padded_img, kern)
......@@ -73,10 +72,10 @@ def py_conv(img, kern, mode, subsample):
return py_conv_scipy(img, kern, mode, subsample)
elif mode == 'valid':
return py_conv_valid_numpy(img, kern)[:, :, ::subsample[0],
::subsample[1]]
::subsample[1]]
elif mode == 'full':
return py_conv_full_numpy(img, kern)[:, :, ::subsample[0],
::subsample[1]]
::subsample[1]]
else:
raise Exception("Can't execute this kernel.")
......@@ -85,12 +84,12 @@ def py_conv_scipy(img, kern, mode, subsample):
assert img.shape[1] == kern.shape[1]
if mode == 'valid':
outshp = (img.shape[0], kern.shape[0],
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
else:
outshp = (img.shape[0], kern.shape[0],
img.shape[2] + kern.shape[2] - 1,
img.shape[3] + kern.shape[3] - 1)
img.shape[2] + kern.shape[2] - 1,
img.shape[3] + kern.shape[3] - 1)
out = numpy.zeros(outshp, dtype='float32')
for b in xrange(out.shape[0]):
for k in xrange(out.shape[1]):
......@@ -133,9 +132,9 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
dtype='float32')
else:
npy_img = theano._asarray(numpy.arange(
numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
npy_kern = -(theano._asarray(numpy.arange(
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
img = pygpu.array(npy_img)
kern = pygpu.array(npy_kern)
......@@ -191,15 +190,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
cpu_mflops = approx_fp / (t1 - t0)
gpu_mflops = approx_fp / (t2 - t1)
if verbose > 0:
print('%15s' % str(ishape), '%15s' % str(kshape), end=' ', file=sys.stdout)
print('%12.5f %7.2f %7.2f %7.1f' % (approx_fp,
cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)), file=sys.stdout)
print('%15s' % str(ishape), '%15s' % str(kshape), end=' ',
file=sys.stdout)
print('%12.5f %7.2f %7.2f %7.1f' %
(approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)),
file=sys.stdout)
if not rval:
print(('test_' + mode + ' id=' + str(id) +
' FAILED for ishape, kshape, mode, subsample,' +
' img_stride, kern_stride, version', ishape,
kshape, mode, subsample, img_stride, kern_stride,
version), file=sys.stdout)
print('test_' + mode + ' id=' + str(id) +
' FAILED for ishape, kshape, mode, subsample,' +
' img_stride, kern_stride, version', ishape,
kshape, mode, subsample, img_stride, kern_stride,
version, file=sys.stdout)
diff = cpuval - gpuval
diffabs = numpy.absolute(diff)
pr_diff = diffabs / numpy.absolute(cpuval)
......@@ -210,7 +211,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
nb_close, "/", diff.size))
print("max relatif diff:", (pr_diff.max(), "avg rel diff:",
numpy.average(pr_diff)))
if not rval and print_ != False:
if not rval and print_ is not False:
if npy_img.shape[0] > 5:
print("img", npy_img[0])
print("kern", npy_kern[0])
......@@ -242,19 +243,20 @@ def exec_conv(version, shapes, verbose, random, mode,
istride, kstride) in enumerate(shapes):
ret = False
try:
ret = _params_allgood(ishape,
kshape,
mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones)
ret = _params_allgood(
ishape,
kshape,
mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones)
except Exception as e:
print(ver, id, (ishape, kshape, subshape, istride, kstride))
print(e)
......@@ -273,11 +275,11 @@ def exec_conv(version, shapes, verbose, random, mode,
def get_basic_shapes():
# basic test of image and kernel shape
# basic test of image and kernel shape
return [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
((1, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 3, 3), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
# basic test for unsquare kernel and image
# basic test for unsquare kernel and image
((1, 1, 2, 4), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 3, 4), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 4, 3), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
......@@ -297,17 +299,17 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
((3, 1) + imshp, (1, 1) + kshp, subsample, img_stride, kern_stride),
# nkern only
((1, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
#batch and nkern
# batch and nkern
((3, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
#batch and stack
# batch and stack
((3, 2) + imshp, (1, 2) + kshp, subsample, img_stride, kern_stride),
#stack and nkern
# stack and nkern
((1, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
#batch, nkern and stack
# batch, nkern and stack
((2, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
#batch, nkern and stack
# batch, nkern and stack
((3, 2) + imshp, (4, 2) + kshp, subsample, img_stride, kern_stride)
]
]
def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
......@@ -344,7 +346,6 @@ def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
def get_valid_shapes():
# img shape, kern shape, subsample shape
shapes = get_basic_shapes()
......@@ -360,39 +361,36 @@ def get_valid_shapes():
# test subsample done in a separate fct
shapes += [
# other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
, ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
, ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
, ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize
, ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image
, ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image, non-square kern
, ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights
, ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)) # added a test case that fail from test_nnet.py.test_conv_nnet2
, ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)) # test precedent but reduced that triger the error
# Test more than maxThreadsDim0
, ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
, ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
]
shapes += [ ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 1 layers
, ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 2 layers
, ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 1 full
, ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
, ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 1 layers
, ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 2 layers
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
]
# other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize
((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image
((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image, non-square kern
((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)), # a big one
((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # MNIST LeNET layer 1
((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)), # layer 1 backprop to weights
((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)), # added a test case that fail from test_nnet.py.test_conv_nnet2
((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)), # test precedent but reduced that triger the error
# Test more than maxThreadsDim0
((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
]
shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 1 layers
((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 2 layers
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 1 full
((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 1 layers
((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 2 layers
((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 full
]
return shapes
......@@ -428,43 +426,35 @@ def test_full():
shapes += [
# other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
, ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
, ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
, ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize
, ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image
, ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image, non-square kern
, ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize
((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image
((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image, non-square kern
((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)), # a big one
((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # MNIST LeNET layer 1
((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)), # layer 1 backprop to weights
# other test
, ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)) # kernel bigger then image
, ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)) # kernel bigger then image
, ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
]
((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)), # kernel bigger then image
((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)),
((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)),
((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)), # kernel bigger then image
((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)),
]
shapes += [
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 1 full
# , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers
# , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
# Test more than maxThreadsDim0
, ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
, ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
]
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 1 full
((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 full
# Test more than maxThreadsDim0
((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
]
version = [-1]
verbose = 0
......@@ -481,7 +471,7 @@ def test_subsample():
((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1, 1), (1, 1)),
((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1, 1), (1, 1)),
((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1, 1), (1, 1))
]
]
shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 1))
shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 2))
shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 1))
......@@ -562,7 +552,6 @@ class TestConv2DGPU(unittest.TestCase):
for mode in ['valid', 'full']:
for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
#((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
# We use only the number of columns.
]:
......@@ -580,47 +569,45 @@ def benchmark():
shapes_valid = [
# test_lenet_28 shape
((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)) # valid
, ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # valid
, ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)) # valid
((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)), # valid
((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_32 shape
, ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)) # valid
, ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # valid
, ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)) # valid
((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)), # valid
((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_64 shape
, ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # valid
, ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)) # valid
, ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)) # valid
((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)), # valid
((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_108 shape
, ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # valid
, ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)) # valid
, ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)) # valid
((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)), # valid
((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_256 shape
, ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)) # valid
, ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)) # valid
, ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)) # valid
]
((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)), # valid
((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)), # valid
((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)), # valid
((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)), # valid
]
shapes_full = [
# test_lenet_28 shape
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # full
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # full
# test_lenet_32 shape
, ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # full conv_full_patch_stack_padded' N=1
((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # full conv_full_patch_stack_padded' N=1
# test_lenet_64 shape
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # full conv_full_patch_stack_padded' N=3
((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # full conv_full_patch_stack_padded' N=3
# test_lenet_108 shape
, ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # full 'conv_full_patch_stack_padded' N=9
((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # full 'conv_full_patch_stack_padded' N=9
# test_lenet_256 shape
, ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)) # full conv_reference_full
]
((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)), # full conv_reference_full
]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version = [-1]
verbose = 1
random = True
......@@ -640,6 +627,6 @@ def test_stack_rows_segfault_070312():
kern = theano.shared(numpy.random.rand(1, 80, 9, 9).astype('float32'))
out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
nkern=1, bsize=1)
nkern=1, bsize=1)
f = theano.function([], [], updates=[(out, op(img, kern))], mode=mode_with_gpu)
f()
import unittest
from theano.tensor.nnet.tests import test_neighbours
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from .test_basic_ops import mode_with_gpu
from ..neighbours import GpuImages2Neibs
......
from __future__ import print_function
from nose.plugins.skip import SkipTest
import numpy
import unittest
......@@ -7,8 +7,6 @@ import theano
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
......@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
n_in = 4098
n_out = 4099
x = T.fmatrix('x')
y = T.lvector('y')
b = T.fvector('b')
#W = T.fmatrix('W')
# we precompute the dot with big shape before to allow the test of
# GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
#(the launch timed out and was terminated) on GPU card not
# (the launch timed out and was terminated) on GPU card not
# powerful enough. We need the big shape to check for corner
# case.
dot_result = T.fmatrix('dot_result')
......@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
dtype=numpy.float32)
#?????yy = numpy.ones((batch_size,),dtype='float32')
yy = numpy.ones((batch_size,), dtype='int32')
b_values = numpy.zeros((n_out,), dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
......@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW],
mode=mode_with_gpu)
# theano.printing.debugprint(classify)
# theano.printing.debugprint(classify_gpu)
assert any([isinstance(node.op,
T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
......@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
We check that we loop when their is too much threads
"""
n_in = 1000
batch_size = 4097
n_out = 1250
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099
# Seed numpy.random with config.unittests.rseed
......@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
rtol = 1e-5
atol = 1e-6
if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
max_i = scaled_err.argmax()
print('max err index:', max_i, max_i / batch_size, end=' ')
print(max_i % batch_size, max_i / n_out, max_i & n_out)
print('At that index:')
print('err:', scaled_err.flatten()[max_i])
print('absolute error:', abs_err.flatten()[max_i])
print('relative error:', rel_err.flatten()[max_i])
print('cpu_out:', cpu_out.flatten()[max_i])
print('gpu_out:', gpu_out.flatten()[max_i])
print('softmax_output_value:', softmax_output_value.flatten()[max_i])
print('dnll_value:', dnll_value[max_i / n_out])
print('y_idx_value:', y_idx_value[max_i / n_out])
assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
rtol, atol)
utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
def test_softmax_with_bias_float16():
......@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float16')
def test_softmax_with_bias_float32():
softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float32')
......@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED)
"""
x = T.matrix('x', dtype=dtypeInput)
b = T.vector('b', dtype=dtypeBias)
......@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
def test_softmax_float16():
softmax_unittest_template('float16')
def test_softmax_float32():
softmax_unittest_template('float32')
def test_softmax_float64():
softmax_unittest_template('float64')
......
import operator
import numpy
import theano
......@@ -25,7 +23,6 @@ def test_deep_copy():
def test_values_eq_approx():
a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
assert GpuArrayType.values_eq_approx(a, a)
b = a.copy()
b[0] = numpy.asarray(b[0]) + 1.
......
......@@ -184,7 +184,7 @@ class GpuArrayType(Type):
@staticmethod
def may_share_memory(a, b):
if (not isinstance(a, gpuarray.GpuArray) or
not isinstance(b, gpuarray.GpuArray)):
not isinstance(b, gpuarray.GpuArray)):
return False
return pygpu.gpuarray.may_share_memory(a, b)
......@@ -200,11 +200,12 @@ class GpuArrayType(Type):
self.broadcastable == other.broadcastable)
def convert_variable(self, var):
if (type(self) == type(var.type) and
self.typecode == var.type.typecode and
self.ndim == var.type.ndim and
vt = var.type
if (type(self) == type(vt) and
self.typecode == vt.typecode and
self.ndim == vt.ndim and
all(sb == ob or ob for sb, ob in zip(self.broadcastable,
var.type.broadcastable))):
vt.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self):
......
......@@ -157,24 +157,11 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py",
"sandbox/linalg/__init__.py",
"sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/basic_ops.py",
"sandbox/gpuarray/nnet.py",
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py",
"sandbox/gpuarray/neighbours.py",
"sandbox/gpuarray/tests/test_subtensor.py",
"sandbox/gpuarray/tests/test_scan.py",
"sandbox/gpuarray/tests/test_neighbours.py",
"sandbox/gpuarray/tests/test_conv_cuda_ndarray.py",
"sandbox/gpuarray/tests/test_type.py",
"sandbox/gpuarray/tests/test_opt.py",
"sandbox/gpuarray/tests/test_blas.py",
"sandbox/gpuarray/tests/test_elemwise.py",
"sandbox/gpuarray/tests/test_nnet.py",
"sandbox/gpuarray/tests/test_basic_ops.py",
"scan_module/scan_utils.py",
"scan_module/scan_views.py",
"scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论