Merge pull request #3476 from abergeron/move_config

Multiple fixes preparing for multi-gpu

Merge pull request #3476 from abergeron/move_config
645557f9 · Pascal Lamblin · 1ec1cd9b · 71dea2cf · 645557f9 · 645557f9
--- a/setup.cfg
+++ b/setup.cfg
 [nosetest]
 match=^test
 nocapture=1
+[flake8]
+ignore=E501,E123,E133
--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
        theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
-if config.device.startswith('cuda') or config.device.startswith('opencl') or \
+if (config.device.startswith('cuda') or
-        config.gpuarray.init_device != '':
+        config.device.startswith('opencl') or
+        config.init_gpu_device.startswith('cuda') or
+        config.init_gpu_device.startswith('opencl')):
    import theano.sandbox.gpuarray
 # Use config.numpy to call numpy.seterr

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
        self.default = default
        def filter(val):
+            # Accept the parameter's default value, or any device string
+            # starting with "gpu", "opencl" or "cuda"; reject the rest.
-            if val.startswith('cpu') or val.startswith('gpu') \
+            if val == self.default or val.startswith('gpu') \
                    or val.startswith('opencl') or val.startswith('cuda'):
                return val
            else:
+                # The format arguments must follow the placeholder order:
+                # offending value, variable name, then the default value.
                raise ValueError(('Invalid value ("%s") for configuration '
                                  'variable "%s". Valid options start with '
-                                  'one of "cpu", "gpu", "opencl", "cuda"'
+                                  'one of "%s", "gpu", "opencl", "cuda"'
-                                  % (val, self.fullname)))
+                                  % (val, self.fullname, self.default)))
        over = kwargs.get("allow_override", True)
        super(DeviceParam, self).__init__(default, filter, over)
    def __str__(self):
-        return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
+        return '%s (%s, gpu*, opencl*, cuda*) ' % (self.fullname, self.default)
 AddConfigVar(
    'device',
@@ -94,14 +94,6 @@ AddConfigVar(
     "on it. Do not use upper case letters, only lower case even if "
     "NVIDIA use capital letters."),
    DeviceParam('cpu', allow_override=False),
-    in_c_key=False,)
-AddConfigVar('gpuarray.init_device',
-             """
-             Device to initialize for gpuarray use without moving
-             computations automatically.
-             """,
-             StrParam(''),
    in_c_key=False)
 AddConfigVar(
@@ -110,12 +102,7 @@ AddConfigVar(
     "Unlike 'device', setting this option will NOT move computations, "
     "nor shared variables, to the specified GPU. "
     "It can be used to run GPU-specific tests on a particular GPU."),
-    EnumStr('', 'gpu',
+    DeviceParam('', allow_override=False),
-            'gpu0', 'gpu1', 'gpu2', 'gpu3',
-            'gpu4', 'gpu5', 'gpu6', 'gpu7',
-            'gpu8', 'gpu9', 'gpu10', 'gpu11',
-            'gpu12', 'gpu13', 'gpu14', 'gpu15',
-            allow_override=False),
    in_c_key=False)
 AddConfigVar(
@@ -131,6 +118,112 @@ AddConfigVar(
    in_c_key=False)
+def default_cuda_root():
+    """Guess the root directory of the CUDA installation.
+
+    A non-empty ``CUDA_ROOT`` environment variable is returned as is.
+    Otherwise every entry of ``PATH`` is searched for a file named
+    ``nvcc`` and the parent of the first matching directory is
+    returned.  Returns '' when ``PATH`` is unset/empty or no entry
+    contains ``nvcc``.
+    """
+    root = os.getenv('CUDA_ROOT', "")
+    if root:
+        return root
+    search_path = os.getenv("PATH")
+    if not search_path:
+        return ''
+    separators = os.path.sep + (os.path.altsep or '')
+    for bin_dir in search_path.split(os.path.pathsep):
+        if os.path.exists(os.path.join(bin_dir, "nvcc")):
+            # Drop trailing separators first: os.path.split('/cuda/bin/')
+            # gives ('/cuda/bin', ''), which would yield the bin directory
+            # itself rather than its parent.
+            trimmed = bin_dir.rstrip(separators)
+            return os.path.split(trimmed or bin_dir)[0]
+    return ''
+# NOTE(review): the help text below still mentions an "AUTO" value and a
+# /usr/local/cuda fallback, but default_cuda_root() only consults the
+# CUDA_ROOT environment variable and PATH and otherwise returns '' --
+# confirm whether "AUTO" is handled elsewhere and update the text.
+AddConfigVar(
+    'cuda.root',
+    """directory with bin/, lib/, include/ for cuda utilities.
+       This directory is included via -L and -rpath when linking
+       dynamically compiled modules.  If AUTO and nvcc is in the
+       path, it will use one of nvcc parent directory.  Otherwise
+       /usr/local/cuda will be used.  Leave empty to prevent extra
+       linker directives.  Default: environment variable "CUDA_ROOT"
+       or else "AUTO".
+       """,
+    StrParam(default_cuda_root),
+    in_c_key=False)
+def filter_nvcc_flags(s):
+    """Validate and normalise the value given for ``nvcc.flags``.
+
+    The string is split on single spaces and empty pieces are dropped.
+    Every remaining piece must start with "-"; otherwise ValueError is
+    raised.  Returns the pieces joined by exactly one space.
+    """
+    assert isinstance(s, str)
+    pieces = []
+    for piece in s.split(' '):
+        if piece:
+            pieces.append(piece)
+    for piece in pieces:
+        if piece.startswith("-"):
+            continue
+        # A bare word means the user wrote "--opt value" instead of
+        # "--opt=value".
+        raise ValueError(
+            "Theano nvcc.flags support only parameter/value pairs without"
+            " space between them. e.g.: '--machine 64' is not supported,"
+            " but '--machine=64' is supported. Please add the '=' symbol."
+            " nvcc.flags value is '%s'" % s)
+    return ' '.join(pieces)
+AddConfigVar('nvcc.flags',
+             "Extra compiler flags for nvcc",
+             ConfigParam("", filter_nvcc_flags),
+             # Not needed in the C key because it is already added to it.
+             # Leaving it out also keeps the md5 of the config the same
+             # whether or not theano.sandbox.cuda is loaded.
+             in_c_key=False)
+AddConfigVar('nvcc.compiler_bindir',
+             "If defined, nvcc compiler driver will seek g++ and gcc"
+             " in this directory",
+             StrParam(""),
+             in_c_key=False)
+AddConfigVar('nvcc.fastmath',
+             "",
+             BoolParam(False),
+             # Not needed in the C key because it is already added to it.
+             # Leaving it out also keeps the md5 of the config the same
+             # whether or not theano.sandbox.cuda is loaded.
+             in_c_key=False)
+# Declared with in_c_key=True, unlike the other options added alongside it.
+AddConfigVar('gpuarray.sync',
+             """If True, every op will make sure its work is done before
+                returning.  Setting this to True will slow down execution,
+                but give much more accurate results in profiling.""",
+             BoolParam(False),
+             in_c_key=True)
+# Deprecated flags.  EnumStr('') lists the empty string as the only
+# value -- presumably so that any other setting is rejected; confirm.
+AddConfigVar('dnn.conv.workmem',
+             "This flag is deprecated; use dnn.conv.algo_fwd.",
+             EnumStr(''),
+             in_c_key=False)
+AddConfigVar('dnn.conv.workmem_bwd',
+             "This flag is deprecated; use dnn.conv.algo_bwd.",
+             EnumStr(''),
+             in_c_key=False)
+# Algorithm selection for cuDNN convolutions.
+# NOTE(review): the first value listed is presumably the default -- confirm
+# against EnumStr.
+AddConfigVar('dnn.conv.algo_fwd',
+             "Default implementation to use for CuDNN forward convolution.",
+             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
+                     'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+AddConfigVar('dnn.conv.algo_bwd',
+             "Default implementation to use for CuDNN backward convolution.",
+             EnumStr('none', 'deterministic', 'fft', 'guess_once',
+                     'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+def default_dnn_path(suffix):
+    """Build a lazily evaluated default for a cuDNN path option.
+
+    Returns a callable that, when called without arguments, yields ''
+    if ``config.cuda.root`` is empty and otherwise ``config.cuda.root``
+    joined with `suffix`.
+    """
+    def f(suffix=suffix):
+        # config.cuda.root is read at call time, not at definition time.
+        if config.cuda.root != '':
+            return os.path.join(config.cuda.root, suffix)
+        return ''
+    return f
+# Both defaults are computed lazily from config.cuda.root by the callables
+# that default_dnn_path() returns.
+AddConfigVar('dnn.include_path',
+             "Location of the cudnn header (defaults to the cuda root)",
+             StrParam(default_dnn_path('include')))
+# This option points at the library to link against, not at the header.
+AddConfigVar('dnn.library_path',
+             "Location of the cudnn link library (defaults to the cuda root)",
+             StrParam(default_dnn_path('lib64')))
 # This flag determines whether or not to raise error/warning message if
 # there is a CPU Op in the computational graph.
 AddConfigVar(

--- a/theano/configparser.py
+++ b/theano/configparser.py
@@ -102,7 +102,7 @@ def change_flags(**kwargs):
                l = [v for v in theano.configparser._config_var_list
                     if v.fullname == k]
                assert len(l) == 1
-                old_val[k] = l[0].__get__()
+                old_val[k] = l[0].__get__(True, None)
            try:
                for k in kwargs:
                    l = [v for v in theano.configparser._config_var_list
@@ -167,7 +167,7 @@ def _config_print(thing, buf):
    for cv in _config_var_list:
        print(cv, file=buf)
        print("    Doc: ", cv.doc, file=buf)
-        print("    Value: ", cv.__get__(), file=buf)
+        print("    Value: ", cv.__get__(True, None), file=buf)
        print("", file=buf)
@@ -182,7 +182,7 @@ def get_config_md5():
    all_opts = sorted([c for c in _config_var_list if c.in_c_key],
                      key=lambda cv: cv.fullname)
    return theano.gof.utils.hash_from_code('\n'.join(
-        ['%s = %s' % (cv.fullname, cv.__get__()) for cv in all_opts]))
+        ['%s = %s' % (cv.fullname, cv.__get__(True, None)) for cv in all_opts]))
 class TheanoConfigParser(object):
@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
        # Trigger a read of the value from config files and env vars
        # This allow to filter wrong value from the user.
        if not callable(configparam.default):
-            configparam.__get__()
+            configparam.__get__(root, type(root))
        else:
            # We do not want to evaluate now the default value
            # when it is a callable.
            try:
                fetch_val_for_key(configparam.fullname)
                # The user provided a value, filter it now.
-                configparam.__get__()
+                configparam.__get__(root, type(root))
            except KeyError:
                pass
        setattr(root.__class__, sections[0], configparam)
@@ -294,6 +294,7 @@ class ConfigParam(object):
        self.default = default
        self.filter = filter
        self.allow_override = allow_override
+        self.is_default = True
        # N.B. --
        # self.fullname  # set by AddConfigVar
        # self.doc       # set by AddConfigVar
@@ -304,16 +305,19 @@ class ConfigParam(object):
        # Calling `filter` here may actually be harmful if the default value is
        # invalid and causes a crash or has unwanted side effects.
-    def __get__(self, *args):
+    def __get__(self, cls, type_):
+        if cls is None:
+            return self
        if not hasattr(self, 'val'):
            try:
                val_str = fetch_val_for_key(self.fullname)
+                self.is_default = False
            except KeyError:
                if callable(self.default):
                    val_str = self.default()
                else:
                    val_str = self.default
-            self.__set__(None, val_str)
+            self.__set__(cls, val_str)
        # print "RVAL", self.val
        return self.val

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -1171,7 +1171,7 @@ def apply_meth(tag):
            code = self.code_sections[tag]
            define_macros, undef_macros = self.get_c_macros(node, name)
-            return os.linesep.join([define_macros, code,
+            return os.linesep.join(['', define_macros, code,
                                    undef_macros])
        else:
            raise utils.MethodNotDefined(
@@ -1428,7 +1428,7 @@ class COp(Op):
            def_macros, undef_macros = self.get_c_macros(node, name)
            def_sub, undef_sub = self.get_sub_macros(sub)
-            return os.linesep.join([def_macros, def_sub,
+            return os.linesep.join(['', def_macros, def_sub,
                                    op_code,
                                    undef_sub, undef_macros])
        else:
@@ -1442,17 +1442,21 @@ class COp(Op):
            define_macros, undef_macros = self.get_c_macros(node, name,
                                                            check_input=False)
+            ctx = ""
+            if 'context' in sub:
+                ctx = ", %s" % (sub['context'],)
            # Generate the C code
            return """
                %(define_macros)s
                {
-                  if (%(func_name)s(%(func_args)s) != 0) {
+                  if (%(func_name)s(%(func_args)s%(ctx)s) != 0) {
                    %(fail)s
                  }
                }
                %(undef_macros)s
                """ % dict(func_name=self.func_name,
-                           fail=sub['fail'],
+                           fail=sub['fail'], ctx=ctx,
                           func_args=self.format_c_function_args(inp, out),
                           define_macros=define_macros,
                           undef_macros=undef_macros)

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
 # import dependency. So we also test it in the file theano/__init__.py
 if config.device.startswith('gpu'):
    use(device=config.device, force=config.force_device, test_driver=False)
-elif config.init_gpu_device:
+elif config.init_gpu_device.startswith('gpu'):
    assert config.device == "cpu", (
        "We can use the Theano flag init_gpu_device"
        " only when the Theano flag device=='cpu'")

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
-import theano.sandbox.dnn_flags
 def dnn_available():
    if dnn_available.avail is None:
@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  return 1;
 }
 """
+            params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
+            if config.dnn.include_path:
+                params.append("-I" + config.dnn.include_path)
+            if config.dnn.library_path:
+                params.append("-L" + config.dnn.library_path)
            # Do not run here the test program. It would run on the
            # default gpu, not the one selected by the user. If mixed
            # GPU are installed or if the GPUs are configured in
            # exclusive mode, this cause bad detection.
            comp, out, err = NVCC_compiler.try_flags(
-                ["-l", "cudnn", "-I" + os.path.dirname(__file__),
+                params=params, preambule=preambule, body=body,
-                 "-I" + config.dnn.include_path,
-                 "-L" + config.dnn.library_path],
-                preambule=preambule, body=body,
                try_run=False, output=True)
            dnn_available.avail = comp

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -8,6 +8,7 @@ import warnings
 import numpy
+from theano import config
 from theano.compat import decode, decode_iter
 from theano.gof import local_bitwidth
 from theano.gof.utils import hash_from_file
@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
 _logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
-from theano.configparser import (config, AddConfigVar, StrParam,
-                                 BoolParam, ConfigParam)
-AddConfigVar('nvcc.compiler_bindir',
-             "If defined, nvcc compiler driver will seek g++ and gcc"
-             " in this directory",
-             StrParam(""),
-             in_c_key=False)
-user_provided_cuda_root = True
-def default_cuda_root():
-    global user_provided_cuda_root
-    v = os.getenv('CUDA_ROOT', "")
-    user_provided_cuda_root = False
-    if v:
-        return v
-    return find_cuda_root()
-AddConfigVar('cuda.root',
-        """directory with bin/, lib/, include/ for cuda utilities.
-        This directory is included via -L and -rpath when linking
-        dynamically compiled modules.  If AUTO and nvcc is in the
-        path, it will use one of nvcc parent directory.  Otherwise
-        /usr/local/cuda will be used.  Leave empty to prevent extra
-        linker directives.  Default: environment variable "CUDA_ROOT"
-        or else "AUTO".
-        """,
-        StrParam(default_cuda_root),
-        in_c_key=False)
-def filter_nvcc_flags(s):
-    assert isinstance(s, str)
-    flags = [flag for flag in s.split(' ') if flag]
-    if any([f for f in flags if not f.startswith("-")]):
-        raise ValueError(
-            "Theano nvcc.flags support only parameter/value pairs without"
-            " space between them. e.g.: '--machine 64' is not supported,"
-            " but '--machine=64' is supported. Please add the '=' symbol."
-            " nvcc.flags value is '%s'" % s)
-    return ' '.join(flags)
-AddConfigVar('nvcc.flags',
-             "Extra compiler flags for nvcc",
-             ConfigParam("", filter_nvcc_flags),
-             # Not needed in c key as it is already added.
-             # We remove it as we don't make the md5 of config to change
-             # if theano.sandbox.cuda is loaded or not.
-             in_c_key=False)
-AddConfigVar('nvcc.fastmath',
-             "",
-             BoolParam(False),
-             # Not needed in c key as it is already added.
-             # We remove it as we don't make the md5 of config to change
-             # if theano.sandbox.cuda is loaded or not.
-             in_c_key=False)
 nvcc_path = 'nvcc'
 nvcc_version = None
@@ -115,14 +55,6 @@ def is_nvcc_available():
            return False
-def find_cuda_root():
-    s = os.getenv("PATH")
-    if not s:
-        return
-    for dir in s.split(os.path.pathsep):
-        if os.path.exists(os.path.join(dir, "nvcc")):
-            return os.path.split(dir)[0]
 rpath_defaults = []
@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
        # provided an cuda.root flag, we need to add one, but
        # otherwise, we don't add it. See gh-1540 and
        # https://wiki.debian.org/RpathIssue for details.
-        if (user_provided_cuda_root and
+        if (not type(config.cuda).root.is_default and
            os.path.exists(os.path.join(config.cuda.root, 'lib'))):
            rpaths.append(os.path.join(config.cuda.root, 'lib'))

--- a/theano/sandbox/dnn_flags.py
+++ b/theano/sandbox/dnn_flags.py
-"""
-This module contains the configuration flags for cudnn support.
-Those are shared between the cuda and gpuarray backend which is why
-they are in this file.
-"""
-import os.path
-from theano.configparser import AddConfigVar, EnumStr, StrParam
-from theano import config
-AddConfigVar('dnn.conv.workmem',
-             "This flag is deprecated; use dnn.conv.algo_fwd.",
-             EnumStr(''),
-             in_c_key=False)
-AddConfigVar('dnn.conv.workmem_bwd',
-             "This flag is deprecated; use dnn.conv.algo_bwd.",
-             EnumStr(''),
-             in_c_key=False)
-AddConfigVar('dnn.conv.algo_fwd',
-             "Default implementation to use for CuDNN forward convolution.",
-             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
-             in_c_key=False)
-AddConfigVar('dnn.conv.algo_bwd',
-             "Default implementation to use for CuDNN backward convolution.",
-             EnumStr('none', 'deterministic', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
-             in_c_key=False)
-AddConfigVar('dnn.include_path',
-             "Location of the cudnn header (defaults to the cuda root)",
-             StrParam(lambda: os.path.join(config.cuda.root, 'include')))
-AddConfigVar('dnn.library_path',
-             "Location of the cudnn header (defaults to the cuda root)",
-             StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -19,13 +19,6 @@ try:
 except ImportError:
    pygpu = None
-AddConfigVar('gpuarray.sync',
-             """If True, every op will make sure its work is done before
-                returning.  Setting this to True will slow down execution,
-                but give much more accurate results in profiling.""",
-             BoolParam(False),
-             in_c_key=True)
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                  GpuArraySharedVariable, gpuarray_shared_constructor)
@@ -57,8 +50,9 @@ if pygpu:
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
-        elif config.gpuarray.init_device != '':
+        elif (config.init_gpu_device.startswith('cuda') or
-            init_dev(config.gpuarray.init_device)
+              config.init_gpu_device.startswith('opencl')):
+            init_dev(config.init_gpu_device)
        from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
                                GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
@@ -70,7 +64,8 @@ if pygpu:
    except Exception:
        error("Could not initialize pygpu, support disabled", exc_info=True)
 else:
-    if (config.gpuarray.init_device != '' or
+    if (config.init_gpu_device.startswith('cuda') or
+            config.init_gpu_device.startswith('opencl') or
            config.device.startswith('opencl') or
            config.device.startswith('cuda')):
        error("pygpu was configured but could not be imported", exc_info=True)
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -2,11 +2,9 @@ import os
 import numpy
-import theano
+from theano import Op, Apply, Type, Variable
-from theano import Op, Apply
+from theano import tensor, config
-from theano import tensor, scalar, config
 from theano.gradient import grad_undefined
-from theano.scalar import Scalar
 from theano.tensor.basic import Alloc, Join, Split
 from theano.gof import HideC
@@ -17,7 +15,7 @@ from six.moves import xrange
 try:
    import pygpu
-    from pygpu import gpuarray, elemwise
+    from pygpu import gpuarray
 except ImportError:
    pass
@@ -293,7 +291,6 @@ class GpuFromHost(Op):
    def perform(self, node, inp, out):
        x, = inp
        z, = out
-        type = node.outputs[0].type
        z[0] = gpuarray.array(x)
    def grad(self, inputs, grads):
@@ -312,254 +309,29 @@ class GpuFromHost(Op):
    def c_code(self, node, name, inputs, outputs, sub):
+        # Emits C code that takes a contiguous version of the host array,
+        # uploads it with pygpu_fromhostdata() into the default pygpu
+        # context, and releases the temporary host reference afterwards.
+        # The temporary is a host numpy array, hence PyArrayObject (the
+        # result type of PyArray_GETCONTIGUOUS), not PyGpuArrayObject.
        return """
+        PyArrayObject *%(name)s_tmp;
+        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
+        if (%(name)s_tmp == NULL)
+          %(fail)s
        Py_XDECREF(%(out)s);
-        %(out)s = pygpu_fromhostdata(PyArray_DATA(%(inp)s),
+        %(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
-                                     get_typecode((PyObject *)PyArray_DESCR(%(inp)s)),
+                                     get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
-                                     PyArray_NDIM(%(inp)s),
+                                     PyArray_NDIM(%(name)s_tmp),
-                                     (size_t *)PyArray_DIMS(%(inp)s),
+                                     (size_t *)PyArray_DIMS(%(name)s_tmp),
-                                     (ssize_t *)PyArray_STRIDES(%(inp)s),
+                                     (ssize_t *)PyArray_STRIDES(%(name)s_tmp),
                                     pygpu_default_context(),
                                     Py_None);
-        if (%(out)s == NULL) {
+        Py_DECREF(%(name)s_tmp);
+        if (%(out)s == NULL)
          %(fail)s
-        }
        """ % {'name': name, 'inp': inputs[0],
               'out': outputs[0], 'fail': sub['fail']}
-    def c_code_cache_version(self):
-        return (4,)
-gpu_from_host = GpuFromHost()
-class GpuFromCuda(Op):
-    view_map = {0: [0]}
-    __props__ = ()
-    def make_node(self, x):
-        from theano.sandbox.cuda import CudaNdarrayType
-        if not isinstance(x.type, CudaNdarrayType):
-            raise TypeError(x)
-        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
-                                              dtype=x.dtype)()])
-    def perform(self, node, inp, out):
-        x, = inp
-        z, = out
-        z[0] = gpuarray.array(numpy.asarray(x))
-    def grad(self, inputs, grads):
-        gz, = grads
-        return [cuda_from_gpu(gz)]
-    def R_op(self, inputs, eval_points):
-        ev, = eval_points
-        if isinstance(ev, GpuArrayType):
-            return [cuda_from_gpu(ev)]
-        else:
-            return ev
-    def infer_shape(self, node, xshp):
-        return xshp
-    def c_headers(self):
-        return ['<cuda_ndarray.cuh>', '<gpuarray/extension.h>',
-                '<gpuarray/types.h>', '<cuda.h>']
-    def c_header_dirs(self):
-        import cuda_ndarray
-        ret = [os.path.dirname(cuda_ndarray.__file__)]
-        cuda_root = config.cuda.root
-        if cuda_root:
-            ret.append(os.path.join(cuda_root, 'include'))
-        return ret
-    def c_lib_dirs(self):
-        import cuda_ndarray
-        ret = [os.path.dirname(cuda_ndarray.__file__)]
-        cuda_root = config.cuda.root
-        if cuda_root:
-            ret.append(os.path.join(cuda_root, 'lib'))
-        return ret
-    def c_libraries(self):
-        return ['cudart', 'cublas', 'cuda']
-    def c_support_code(self):
-        return """
-        CUcontext (*cuda_get_ctx)(void *ctx);
-        gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
-        """
-    def c_init_code(self):
-        return ['cuda_get_ctx = (CUcontext (*)(void *))gpuarray_get_extension("cuda_get_ctx");',
-                'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf");']
-    def c_code(self, node, name, inputs, outputs, sub):
-        return """
-        int %(name)serr;
-        gpudata *%(name)sdata;
-        CUcontext %(name)scur;
-        size_t *%(name)sdims;
-        ssize_t *%(name)sstr;
-        cuCtxGetCurrent(&%(name)scur);
-        if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
-            PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
-            %(fail)s
-        }
-        %(name)sdims = (size_t *)calloc(%(in)s->nd, sizeof(size_t));
-        if (%(name)sdims == NULL) {
-            PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
-            %(fail)s
-        }
-        %(name)sstr = (ssize_t *)calloc(%(in)s->nd, sizeof(ssize_t));
-        if (%(name)sstr == NULL) {
-            free(%(name)sdims);
-            PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
-            %(fail)s
-        }
-        for (unsigned int i = 0; i < %(in)s->nd; i++) {
-            %(name)sdims[i] = (size_t)CudaNdarray_HOST_DIMS(%(in)s)[i];
-            %(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4;
-        }
-        %(name)sdata = cuda_make_buf(pygpu_default_context()->ctx,
-                                     (CUdeviceptr)%(in)s->devdata,
-                                     ((size_t)%(in)s->data_allocated)*4);
-        if (%(name)sdata == NULL) {
-            Py_DECREF(%(out)s);
-            free(%(name)sdims);
-            free(%(name)sstr);
-            PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
-            %(fail)s
-        }
-        Py_XDECREF(%(out)s);
-        %(out)s = pygpu_fromgpudata(%(name)sdata, 0, GA_FLOAT, %(in)s->nd,
-                                    %(name)sdims, %(name)sstr,
-                                    pygpu_default_context(), 1,
-                                    (PyObject *)%(in)s,
-                                    (PyObject *)&PyGpuArrayType);
-        pygpu_default_context()->ops->buffer_release(%(name)sdata);
-        free(%(name)sdims);
-        free(%(name)sstr);
-        if (%(out)s == NULL) {
-            %(fail)s
-        }
-        """ % {'name': name, 'in': inputs[0], 'out': outputs[0],
-               'fail': sub['fail']}
    def c_code_cache_version(self):
        return (5,)
-gpu_from_cuda = GpuFromCuda()
+gpu_from_host = GpuFromHost()
-class CudaFromGpu(Op):
-    view_map = {0: [0]}
-    __props__ = ()
-    def make_node(self, x):
-        from theano.sandbox.cuda import CudaNdarrayType
-        if not isinstance(x.type, GpuArrayType):
-            raise TypeError(x)
-        if x.type.dtype != 'float32':
-            raise TypeError(x)
-        return Apply(self, [x], [CudaNdarrayType(broadcastable=x.broadcastable)()])
-    def perform(self, node, inp, out):
-        from theano.sandbox.cuda import filter as cuda_filter
-        x, = inp
-        z, = out
-        z[0] = cuda_filter(theano._asarray(x, dtype='float32'),
-                           tuple([0] * x.ndim), 0, z[0])
-    def grad(self, inputs, grads):
-        gz, = grads
-        return [gpu_from_cuda(gz)]
-    def R_op(self, inputs, eval_points):
-        from theano.sandbox.cuda import CudaNdarrayType
-        ev, = eval_points
-        if (isinstance(ev, CudaNdarrayType)):
-            return [gpu_from_cuda(ev)]
-        else:
-            return [ev]
-    def infer_shape(self, node, shp):
-        return shp
-    def c_headers(self):
-        return ['<cuda_ndarray.cuh>', '<gpuarray/extension.h>', '<cuda.h>']
-    def c_header_dirs(self):
-        import cuda_ndarray
-        ret = [os.path.dirname(cuda_ndarray.__file__)]
-        cuda_root = config.cuda.root
-        if cuda_root:
-            ret.append(os.path.join(cuda_root, 'include'))
-        return ret
-    def c_lib_dirs(self):
-        import cuda_ndarray
-        ret = [os.path.dirname(cuda_ndarray.__file__)]
-        cuda_root = config.cuda.root
-        if cuda_root:
-            ret.append(os.path.join(cuda_root, 'lib'))
-        return ret
-    def c_libraries(self):
-        return ['cudart', 'cublas', 'cuda']
-    def c_support_code(self):
-        return """
-        CUcontext (*cuda_get_ctx)(void *ctx);
-        CUdeviceptr (*cuda_get_ptr)(gpudata *g);
-        """
-    def c_init_code(self):
-        return ['cuda_get_ctx = (CUcontext (*)(void *ctx))gpuarray_get_extension("cuda_get_ctx");',
-                'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
-    def c_code(self, node, name, inputs, outputs, sub):
-        return """
-        int %(name)serr = 0, %(name)si;
-        CUcontext %(name)scur;
-        cuCtxGetCurrent(&%(name)scur);
-        if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
-            PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
-            %(fail)s
-        }
-        if (GpuArray_sync(&%(inp)s->ga) != GA_NO_ERROR) {
-            PyErr_SetString(PyExc_RuntimeError, "Could not sync GpuArray");
-            %(fail)s
-        }
-        Py_XDECREF(%(out)s);
-        %(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd);
-        if (!%(out)s) {
-            %(fail)s
-        }
-        for (%(name)si = 0; %(name)si < %(inp)s->ga.nd; %(name)si++) {
-            CudaNdarray_set_dim(%(out)s, %(name)si, %(inp)s->ga.dimensions[%(name)si]);
-            CudaNdarray_set_stride(%(out)s, %(name)si, %(inp)s->ga.strides[%(name)si]/4);
-        }
-        %(name)serr = CudaNdarray_set_device_data(%(out)s,
-          (float *)(((char *)cuda_get_ptr(%(inp)s->ga.data))+%(inp)s->ga.offset),
-                                          (PyObject *)%(inp)s);
-        if (%(name)serr) {
-           %(fail)s
-        }
-        """ % {'name': name, 'inp': inputs[0], 'out': outputs[0],
-               'fail': sub['fail']}
-    def c_code_cache_version(self):
-        return (3,)
-cuda_from_gpu = CudaFromGpu()
 class GpuAlloc(HideC, Alloc):
@@ -592,7 +364,7 @@ class GpuAlloc(HideC, Alloc):
        sh, bcast = self.validate_shape(shape)
        if value.ndim > len(sh):
            TypeError("The GpuAlloc value to use has more dimensions "
-                      "than the specified shape", v.ndim, len(sh))
+                      "than the specified shape", value.ndim, len(sh))
        otype = value.type.clone(broadcastable=bcast)
        return Apply(self, [value] + sh, [otype()])
@@ -686,14 +458,14 @@ class GpuAlloc(HideC, Alloc):
        return (2,)
    def do_constant_folding(self, node):
+        from . import subtensor, blas
        for client in node.outputs[0].clients:
            if client[0] == 'output':
                # If the output is a constant, it will have to be deepcopied
                # each time the function is called.  So we do not fold.
                return False
-            elif (  # The following ops work inplace of their input id 0.
+            # The following ops work inplace of their input id 0.
-                  client[1] == 0 and
+            elif (client[1] == 0 and
-                  isinstance(client[0].op, (
                  # Ops that will work inplace on the Alloc. So if they
                  # get constant_folded, they would copy the
                  # constant and this is less efficient.
@@ -701,14 +473,13 @@ class GpuAlloc(HideC, Alloc):
                  # Not doing the constant folding could also lower
                  # the peak memory usage, as the "constant" won't
                  # always exist.
-                      # theano.tensor.subtensor.AdvancedIncSubtensor,
+                  isinstance(client[0].op,
-                      theano.sandbox.gpuarray.subtensor.GpuIncSubtensor,
+                             (subtensor.GpuIncSubtensor,
-                      theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1,
+                              subtensor.GpuAdvancedIncSubtensor1,
-                      theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1_dev20,
+                              subtensor.GpuAdvancedIncSubtensor1_dev20,
-                      theano.sandbox.gpuarray.blas.GpuGemm,
+                              blas.GpuGemm, blas.GpuGemv,
-                      theano.sandbox.gpuarray.blas.GpuGemv,
+                              blas.GpuGer)
-                      theano.sandbox.gpuarray.blas.GpuGer,
+                             )):
-                  ))):
                return False
            # If the client is a transfer, we don't want to fold. We
            # let the moving opt finish before deciding what to do.
@@ -1089,8 +860,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
                code=code, name="k",
                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
                flags=Kernel.get_flags(self.dtype),
-                objvar='k_eye_'+name,
+                objvar='k_eye_' + name)]
-                )]
    def c_code(self, node, name, inp, out, sub):
        n, m = inp

--- a/theano/sandbox/gpuarray/conv.py
+++ b/theano/sandbox/gpuarray/conv.py
@@ -5,17 +5,15 @@ import theano
 from theano import config, gof
 try:
-    import pygpu
    from pygpu import gpuarray
 except ImportError:
    pass
-from six.moves import reduce
-from .comp import NVCC_compiler
 from .type import GpuArrayType
-from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
+from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
 from theano.gof import utils
 class GpuConv(GpuKernelBase, gof.Op):
    """
    Implement the batched and stacked 2d convolution on the gpu.
@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
        nb = 0
        if self.kshp is not None:
            nb = self.kshp[1]
-        return ['-DTHEANO_KERN_WID=' + str(nb)]  # ,'-g','-G']
+        return ['-DTHEANO_KERN_WID=' + str(nb)]
    def c_headers(self):
-        if pygpu.get_default_context().kind == 'opencl':
+        return ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']
-            raise MethodNotDefined('cuda only')
-        return ['<stdint.h>', '<stdio.h>', 'cuda.h',
-                '<gpuarray/extension.h>', '<numpy_compat.h>',
-                '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
-        else:
-            return []
    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 21)
+        return (0, 22)
-    def c_init_code(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['setup_ext_cuda();']
    def c_code(self, node, nodename, inp, out_, sub):
        img, kern = inp

--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -26,10 +26,7 @@ from .conv import GpuConv
 # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
 from .nnet import GpuSoftmax
 from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
-from .opt_util import alpha_merge, output_merge
+from .opt_util import alpha_merge, output_merge, inplace_allocempty
-# We need to import this to define the flags.
-from theano.sandbox import dnn_flags  # noqa
 def dnn_available():
@@ -50,7 +47,6 @@ def dnn_available():
        dnn_available.avail = False
    preambule = """
 #include <stdio.h>
-#include <cuda.h>
 #include <cudnn.h>
 #include <cudnn_helper.h>
 """
@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  return 1;
 }
 """
+    params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
+    if config.dnn.include_path:
+        params.append("-I" + config.dnn.include_path)
+    if config.dnn.library_path:
+        params.append("-L" + config.dnn.library_path)
    # Do not run the test program here. It would run on the
    # default GPU, not the one selected by the user. If mixed
    # GPUs are installed or if the GPUs are configured in
    # exclusive mode, this causes bad detection.
    comp, out, err = GCC_compiler.try_flags(
-        ["-l", "cudnn", "-I" + os.path.dirname(__file__),
+        params, preambule=preambule, body=body,
-         "-I" + config.dnn.include_path,
-         "-L" + config.dnn.library_path],
-        preambule=preambule, body=body,
        try_run=False, output=True)
    dnn_available.avail = comp
@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
                       'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
-@local_optimizer([GpuDnnConv], inplace=True)
+@inplace_allocempty(GpuDnnConv, 2)
-def local_dnn_conv_inplace(node):
+def local_dnn_conv_inplace(node, inputs):
-    if type(node.op) != GpuDnnConv or node.op.inplace:
-        return
-    inputs = list(node.inputs)
-    dest = inputs[2]
-    if (dest.owner and
-            isinstance(dest.owner.op, GpuAllocEmpty) and
-            len(dest.clients) > 1):
-        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
    return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
-@local_optimizer([GpuDnnConvGradW], inplace=True)
+@inplace_allocempty(GpuDnnConvGradW, 2)
-def local_dnn_convgw_inplace(node):
+def local_dnn_convgw_inplace(node, inputs):
-    if type(node.op) != GpuDnnConvGradW or node.op.inplace:
-        return
-    inputs = list(node.inputs)
-    dest = inputs[2]
-    if (dest.owner and
-            isinstance(dest.owner.op, GpuAllocEmpty) and
-            len(dest.clients) > 1):
-        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
    return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
-@local_optimizer([GpuDnnConvGradI], inplace=True)
+@inplace_allocempty(GpuDnnConvGradI, 2)
-def local_dnn_convgi_inplace(node):
+def local_dnn_convgi_inplace(node, inputs):
-    if type(node.op) != GpuDnnConvGradI or node.op.inplace:
-        return
-    inputs = list(node.inputs)
-    dest = inputs[2]
-    if (dest.owner and
-            isinstance(dest.owner.op, GpuAllocEmpty) and
-            len(dest.clients) > 1):
-        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
    return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
 optdb.register('local_dnna_conv_inplace',
               tensor.opt.in2out(local_dnn_conv_inplace,
                                 local_dnn_convgw_inplace,
                                 local_dnn_convgi_inplace,
-                                 name="local_dnn_conv_inplace"),
+                                 name="local_dnna_conv_inplace"),
               70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
 @register_opt('cudnn')
-@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
+@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
 def local_dnn_conv_alpha_merge(node, *inputs):
    return [GpuDnnConv(algo=node.op.algo)(*inputs)]
 @register_opt('cudnn')
-@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
+@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
 def local_dnn_convw_alpha_merge(node, *inputs):
    return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
 @register_opt('cudnn')
-@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
+@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
 def local_dnn_convi_alpha_merge(node, *inputs):
    return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
 @register_opt('cudnn')
-@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
+@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_conv_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConv(algo=node.op.algo)(*inputs)]
 @register_opt('cudnn')
-@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
+@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_convw_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
 @register_opt('cudnn')
-@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
+@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
 def local_dnn_convi_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
 from __future__ import print_function
 import copy
-import os
 from theano.compat import izip
 import numpy
-import theano
 from theano import Apply, scalar, config
 from theano import scalar as scal
 from six.moves import StringIO, xrange
 from theano.gof.utils import MethodNotDefined
-from theano.gof.cmodule import GCC_compiler
 from theano.scalar import Scalar
 from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
@@ -108,7 +105,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
-                enumerate(node.outputs) if not n in self.inplace_pattern]
+                enumerate(node.outputs) if n not in self.inplace_pattern]
        scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
        fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
@@ -132,7 +129,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
            else:
                scal_out.append(arg.name + '[i]')
-        kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
+        kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
                                    scal_in, scal_out,
                                    dict(fail='return;'))
@@ -171,25 +168,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                        ("npy_float64", "ga_double"),
                        ]:
            kop = kop.replace(npy, ga)
-        return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
+        return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
-        else:
-            return []
-    def c_compiler(self):
-        return GCC_compiler
    def c_headers(self):
-        if pygpu.get_default_context().kind == 'opencl':
+        return ['<numpy_compat.h>', '<gpuarray/types.h>']
-            raise MethodNotDefined('cuda only')
-        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
-                '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
    def c_support_code(self):
        return self.scalar_op.c_support_code()
@@ -231,11 +213,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                                              node.outputs[0].type.dtype),
                       objvar='elem_%d_%s' % (nd, nodename))]
-    def c_init_code(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['setup_ext_cuda();']
    def c_code(self, node, name, inputs, outputs, sub):
        if pygpu.get_default_context().kind == 'opencl':
            raise MethodNotDefined('cuda only')
@@ -417,7 +394,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                         GpuKernel_error(&%(kname)s, err));
            %(fail)s;
        }
-        """ % dict(kname=kname,fail=fail)
+        """ % dict(kname=kname, fail=fail)
        if config.gpuarray.sync:
            code += """
            err = GpuArray_sync(&%(z)s->ga);
@@ -460,7 +437,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
    def c_code_cache_version(self):
        ver = self.scalar_op.c_code_cache_version()
        if ver:
-            return (3, ver)
+            return (4, ver)
        else:
            return ver
@@ -495,7 +472,7 @@ class GpuDimShuffle(HideC, DimShuffle):
        res = input
-        res = res.transpose(self.shuffle+self.drop)
+        res = res.transpose(self.shuffle + self.drop)
        shape = list(res.shape[:len(self.shuffle)])
        for augm in self.augment:
@@ -533,7 +510,7 @@ class GpuDimShuffle(HideC, DimShuffle):
            Py_DECREF(tmp);
            return res;
        }
-        """ % dict(shuffle=', '.join(str(a) for a in (self.shuffle+self.drop)),
+        """ % dict(shuffle=', '.join(str(a) for a in (self.shuffle + self.drop)),
                   name=name, nd_out=len(self.new_order),
                   copy_shape=copy_shape(len(self.new_order)))
@@ -565,7 +542,7 @@ class GpuDimShuffle(HideC, DimShuffle):
        return process
    def c_code_cache_version(self):
-        return (4,)
+        return (5,)
 class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
@@ -671,8 +648,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        if self.pre_scalar_op:
            # Currently we only tested pre_scalar_op that don't cause
            # upcast.
-            d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
-            assert d1.dtype == ret.outputs[0].dtype
            assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
        if self.reduce_mask is None:
            if self.axis is None:
@@ -732,17 +707,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            return False
        return True
-    def c_header_dirs(self):
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
    def c_headers(self):
-        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
+        return ['<numpy_compat.h>', '<gpuarray/types.h>']
-                '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
-    def c_init_code(self):
-        return ['setup_ext_cuda();']
    def c_code(self, node, name, inp, out, sub):
        x, = inp
@@ -760,6 +726,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        sio = StringIO()
        fail = sub['fail']
+        ctx = sub['context']
        # check input
        print("""
@@ -824,8 +791,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(%(nd_out)s, new_dims,
                                %(out_typecode)s, GA_C_ORDER,
-                                pygpu_default_context(),
+                                pygpu_default_context(), Py_None);
-                                Py_None);
            if (NULL == %(z)s)
            {
                PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
@@ -863,14 +829,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            # check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
            # TODO: check if we are ccontiguous when we un-dimshuffle
            # TODO: if only some dims are ccontiguous, call version with less dims.
-            print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){'%locals(), file=sio)
+            print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){' % locals(),
+                  file=sio)
            self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
            print("}else{", file=sio)
-            getattr(self, 'c_code_reduce_%s'%(''.join(
+            getattr(self, 'c_code_reduce_%s' %
-                str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
+                    (''.join(str(i) for i in self.reduce_mask)))(
+                sio, node, name, x, z, fail)
            print("}", file=sio)
        else:
-            getattr(self, 'c_code_reduce_%s'%(''.join(
+            getattr(self, 'c_code_reduce_%s' % (''.join(
                str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
        # \end bracket the reduction ...
@@ -1094,8 +1062,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        else:
            assert isinstance(self.scalar_op, (scal.Maximum,
                                               scal.Minimum))
-            if self.pre_scalar_op:  # TODO, multi_dtype!
+            if self.pre_scalar_op:  # TODO: multiple dtypes
-                #dtype = node.inputs[0].dtype
+                # dtype = node.inputs[0].dtype
                dtype = 'float32'
                dummy_var = scal.Scalar(dtype=dtype)()
@@ -1943,12 +1911,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_cache_version_apply(self, node):
-        version = [16]  # the version corresponding to the c code in this Op
+        version = [17]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
-        scalar_node = Apply(self.scalar_op,
-                [Scalar(dtype=input.type.dtype)() for input in node.inputs],
-                [Scalar(dtype=output.type.dtype)() for output in node.outputs])
        version.extend(self.scalar_op.c_code_cache_version())
        for i in node.inputs + node.outputs:
            version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
@@ -1962,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        in_dtype = node.inputs[0].dtype
        out_dtype = node.outputs[0].dtype
        acc_dtype = self._acc_dtype(node.inputs[0].dtype)
-        flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
+        flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
        in_type = gpuarray.dtype_to_ctype(in_dtype)
        out_type = gpuarray.dtype_to_ctype(out_dtype)
        acc_type = gpuarray.dtype_to_ctype(acc_dtype)
@@ -2106,7 +2071,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                ]
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
-        #01, 011, 0111
+        # 01, 011, 0111
        if (0 == self.reduce_mask[0] and
                all(self.reduce_mask[1:]) and
                nd_in in[2, 3, 4]):
@@ -2303,10 +2268,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            # this kernel uses one block for multiple column(up to 32TODO),
            # threads per block for each element per column.
-# thread.x = dim 2 contiguous
+            # thread.x = dim 2 contiguous
-# thread.y = dim 1
+            # thread.y = dim 1
-# block.x = dim 0
+            # block.x = dim 0
-# block.y = dim 1 rest
+            # block.y = dim 1 rest
            init = self._k_init(node, nodename)
            decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
            reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
@@ -2625,7 +2590,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
            kname = "kernel_reduce_1011"
-            k_var= "kernel_reduce_1011_" + nodename
+            k_var = "kernel_reduce_1011_" + nodename
            sio = StringIO()
            print("""
            KERNEL void %(kname)s(
@@ -2753,7 +2718,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
                       flags=Kernel.get_flags(node.inputs[0].type.dtype,
                                              acc_dtype,
                                              node.outputs[0].type.dtype),
-                       objvar='k_reduk_'+name)]
+                       objvar='k_reduk_' + name)]
    def c_code(self, node, name, inp, out, sub):
        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
@@ -2768,7 +2733,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        if (%(sync)d)
            GpuArray_sync(&%(out)s->ga);
-""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
+        """ % dict(out=out[0], inp=inp[0], fail=sub['fail'],
                   sync=bool(config.gpuarray.sync))
        k = self.get_kernel_cache(node)
        _, src, _, ls = k._get_basic_kernel(k.init_local_size,
@@ -2816,7 +2781,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
                 %(fail)s
             }
         }
-""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
+        """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))
        else:
            code += """
@@ -2828,7 +2793,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
                %(fail)s
            }
        }
-""" % dict(output=output, fail=sub['fail'],
+        """ % dict(output=output, fail=sub['fail'],
                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))
        if acc_dtype != node.outputs[0].type.dtype:
@@ -2837,12 +2802,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
                          %(acc_type)s, GA_C_ORDER, pygpu_default_context(),
                          Py_None);
        if (!tmp) %(fail)s
-""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
+        """ % dict(output=output, fail=sub['fail'],
+                   acc_type=dtype_to_typecode(acc_dtype))
        else:
            code += """
        tmp = %(output)s;
        Py_INCREF(tmp);
-""" % dict(output=output)
+        """ % dict(output=output)
        # We need the proxies since we are passing a pointer to the
        # data into the call and therefore we need a real copy of the
@@ -2850,7 +2816,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        code += """
        args[0] = &n;
        args[1] = tmp->ga.data;
-""" % dict(output=output)
+        """ % dict(output=output)
        p = 2
        for i in range(node.inputs[0].ndim):
@@ -2858,7 +2824,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
        args[%(p)s] = &proxy_dim[%(i)s];
        n *= %(input)s->ga.dimensions[%(i)s];
-""" % dict(i=i, p=p, input=input)
+        """ % dict(i=i, p=p, input=input)
            p += 1
            if not redux[i]:
                code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
@@ -2867,14 +2833,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        args[%(p)s] = %(input)s->ga.data;
        proxy_off = %(input)s->ga.offset;
        args[%(p)s+1] = &proxy_off;
-""" % dict(p=p, input=input)
+        """ % dict(p=p, input=input)
        p += 2
        for i in range(node.inputs[0].ndim):
            code += """
        proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
        args[%(p)s] = &proxy_str[%(i)s];
-""" % dict(p=p, i=i, input=input)
+        """ % dict(p=p, i=i, input=input)
            p += 1
        code += """
@@ -2911,14 +2877,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
                %(fail)s
            }
        }
-""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
+        """ % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
                   ls=ls, fail=sub['fail'], output=output, input=input,
                   cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
        return code
    def c_code_cache_version(self):
-        return (1, self.GpuKernelBase_version)
+        return (2, self.GpuKernelBase_version)
    def generate_kernel(self, node, odtype, redux):
        if isinstance(self.scalar_op, scalar.basic.Add):
@@ -2942,8 +2908,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
            redux = self.redux
        if any(redux):
-            output[0] = self.get_kernel_cache(node)(input).astype(copy=False,
+            output[0] = self.get_kernel_cache(node)(input).astype(
-                                             dtype=node.outputs[0].type.dtype)
+                copy=False, dtype=node.outputs[0].type.dtype)
        else:
            output[0] = pygpu.gpuarray.array(input, copy=True,
                                             dtype=node.outputs[0].type.dtype)

--- a/theano/sandbox/gpuarray/kernel_codegen.py
+++ b/theano/sandbox/gpuarray/kernel_codegen.py
@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
 """
 try:
-    import pygpu
    from pygpu import gpuarray
 except ImportError:
    pass
 def nvcc_kernel(name, params, body):
    """
    Return the c code of a kernel function.
@@ -174,9 +174,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
    ctype = gpuarray.dtype_to_ctype(dtype)
-    return [
    # get max of buf (trashing all but buf[0])
-            inline_reduce_max(N, buf, threadPos, threadCount),
+    return [inline_reduce_max(N, buf, threadPos, threadCount),
            '__syncthreads()',
            ('%s row_max = ' + buf + '[0]') % ctype,
            '__syncthreads()',

--- a/theano/sandbox/gpuarray/neighbours.py
+++ b/theano/sandbox/gpuarray/neighbours.py
-import os
 import numpy
 from theano import Op, Apply, config
-from theano.gof import local_optimizer
 from theano.tensor.nnet.neighbours import Images2Neibs
 import theano.tensor as T
 try:
    import pygpu
-    from pygpu import gpuarray, elemwise
+    from pygpu import gpuarray
 except ImportError:
    pass
-from .basic_ops import (as_gpuarray_variable,
+from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
-                        host_from_gpu, gpu_from_host,
-                        GpuKernelBase, Kernel)
 from .opt import register_opt as register_gpu_opt, op_lifter
 from .type import GpuArrayType
-from .comp import NVCC_compiler
 class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                                   dtype=ten4.type.dtype)()])
    def c_code_cache_version(self):
-        return (10,1)
+        return (11,)
    def c_headers(self):
-        if pygpu.get_default_context().kind == 'opencl':
+        return ['<numpy_compat.h>', '<gpuarray/types.h>']
-            raise MethodNotDefined('cuda only')
-        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
-                '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
-        else:
-            return []
-    def c_init_code(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['setup_ext_cuda();']
    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype

--- a/theano/sandbox/gpuarray/nerv.py
+++ b/theano/sandbox/gpuarray/nerv.py
@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
 @opt.register_opt()
-@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
+@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
 def local_gemm16_alpha_merge(node, *inputs):
    return [Gemm16(relu=node.op.relu)(*inputs)]
 @opt.register_opt()
-@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
+@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
 def local_gemm16_output_merge(node, *inputs):
    return [Gemm16(relu=node.op.relu)(*inputs)]

--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
 from __future__ import print_function
 import numpy
-import os
 from theano import Op, Apply, config
 from six import StringIO
 try:
    import pygpu
-    from pygpu import gpuarray, elemwise
+    from pygpu import gpuarray
 except ImportError:
    pass
@@ -41,16 +40,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
    def c_headers(self):
-        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
+        return ['<numpy_compat.h>', '<gpuarray/types.h>']
-                '<gpuarray/types.h>']
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
@@ -302,7 +293,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        return sio.getvalue()
    def c_code_cache_version(self):
-        return (7,)
+        return (8,)
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
@@ -328,18 +319,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        return Apply(self, [dnll, sm, y_idx], [sm.type()])
    def c_code_cache_version(self):
-        return (10,)
+        return (11,)
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
    def c_headers(self):
-        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
+        return ['<numpy_compat.h>', '<gpuarray/types.h>']
-                '<gpuarray/types.h>']
    def c_code(self, node, nodename, inp, out, sub):
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
@@ -541,21 +524,10 @@ class GpuSoftmax(GpuKernelBase, Op):
        return shape
    def c_code_cache_version(self):
-        return (14,) + inline_softmax.code_version
+        return (15,) + inline_softmax.code_version
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
    def c_headers(self):
-        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
+        return ['<numpy_compat.h>', '<gpuarray/types.h>']
-                '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
-    def c_init_code(self):
-        return ['setup_ext_cuda();']
    def c_code(self, node, nodename, inp, out, sub):
        dtype_x = node.inputs[0].dtype
@@ -665,15 +637,15 @@ class GpuSoftmax(GpuKernelBase, Op):
            ]
        kernels = []
        kname = "kSoftmax"
-        k_var= "kSoftmax_" + nodename
+        k_var = "kSoftmax_" + nodename
-        code = nvcc_kernel(kname,
+        code = nvcc_kernel(
+            kname,
            params=['const ga_size M', 'const ga_size N',
                    'const %s * x' % type_x, 'const ga_size offset_x',
                    'const ga_ssize sx0', 'const ga_ssize sx1',
                    '%s * sm' % type_sm, 'const ga_size offset_sm',
                    'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
-                body=[
+            body=["extern __shared__ %s buf[]" % type_acc,
-                    "extern __shared__ %s buf[]" % type_acc,
                  "%s * buf2 = buf + N" % type_acc,
                  "x = (const %s *)(((char *)x)+offset_x)" % type_x,
                  "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
@@ -696,15 +668,15 @@ class GpuSoftmax(GpuKernelBase, Op):
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        kname = "kSoftmax_fixed_shared"
-        k_var= "kSoftmax_fixed_shared" + nodename
+        k_var = "kSoftmax_fixed_shared" + nodename
-        code = nvcc_kernel(kname,
+        code = nvcc_kernel(
+            kname,
            params=['const ga_size M', 'const ga_size N',
                    'const %s * x' % type_x, 'const ga_size offset_x',
                    'const ga_ssize sx0', 'const ga_ssize sx1',
                    '%s * sm' % type_sm, 'const ga_size offset_sm',
                    'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
-                body=[
+            body=["extern __shared__ %s buf[]" % type_acc,
-                    "extern __shared__ %s buf[]" % type_acc,
                  "x = (const %s *)(((char *)x)+offset_x)" % type_x,
                  "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
@@ -746,23 +718,10 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        return [shape[0]]
    def c_code_cache_version(self):
-        return (13,) + inline_softmax.code_version
+        return (14,) + inline_softmax.code_version
-    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
-        else:
-            return []
    def c_headers(self):
-        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
+        return ['<numpy_compat.h>', '<gpuarray/types.h>']
-                '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
-    def c_init_code(self):
-        return ['setup_ext_cuda();']
    def c_code(self, node, nodename, inp, out, sub):
        dtype_x = node.inputs[0].dtype
@@ -892,7 +851,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
-        code = nvcc_kernel(kname,
+        code = nvcc_kernel(
+            kname,
            params=['const ga_size M', 'const ga_size N',
                    'const %s * x' % type_x, 'const ga_size offset_x',
                    'const ga_ssize sx0', 'const ga_ssize sx1',
@@ -900,8 +860,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
                    'const ga_ssize sb0',
                    '%s * sm' % type_sm, 'const ga_size offset_sm',
                    'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
-                body=[
+            body=["extern __shared__ %s buf[]" % type_acc,
-                    "extern __shared__ %s buf[]" % type_acc,
                  "%s * buf2 = buf + N" % type_acc,
                  "x = (const %s *)(((char *)x)+offset_x)" % type_x,
                  "b = (const %s *)(((char *)b)+offset_b)" % type_b,
@@ -926,7 +885,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
                              flags=flags, objvar=k_var))
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
-        code = nvcc_kernel(kname,
+        code = nvcc_kernel(
+            kname,
            params=['const ga_size M', 'const ga_size N',
                    'const %s * x' % type_x, 'const ga_size offset_x',
                    'const ga_ssize sx0', 'const ga_ssize sx1',
@@ -934,8 +894,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
                    'const ga_ssize sb0',
                    '%s * sm' % type_sm, 'const ga_size offset_sm',
                    'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
-                body=[
+            body=["extern __shared__ %s buf[]" % type_acc,
-                    "extern __shared__ %s buf[]" % type_acc,
                  "x = (const %s *)(((char *)x)+offset_x)" % type_x,
                  "b = (const %s *)(((char *)b)+offset_b)" % type_b,
                  "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
 @register_opt()
-@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2)
+@alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
 def local_gpuagemm_alpha_merge(node, *inputs):
    return [gpugemm_no_inplace(*inputs)]
 @register_opt()
-@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2)
+@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
 def local_gpuagemm_output_merge(node, *inputs):
    return [gpugemm_no_inplace(*inputs)]

--- a/theano/sandbox/gpuarray/opt_util.py
+++ b/theano/sandbox/gpuarray/opt_util.py
@@ -7,23 +7,35 @@ from theano.gof import local_optimizer
 from theano.tensor import (DimShuffle, get_scalar_constant_value,
                           NotScalarConstantError)
-from .basic_ops import GpuFromHost, HostFromGpu
+from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty
 from .elemwise import GpuDimShuffle, GpuElemwise
 _one = scal.constant(numpy.asarray(1.0, dtype='float64'))
 def grab_cpu_scalar(v, nd):
+    """
+    Get a scalar variable value from the tree at `v`.
+    This function will dig through transfers and dimshuffles to get
+    the constant value. If no such constant is found, it returns None.
+    Parameters
+    ----------
+    v : variable
+        Theano variable to extract the constant value from.
+    nd : int
+        Expected number of dimensions for the variable (for
+        broadcasted constants).
+    """
    if v.owner is not None:
        n = v.owner
-        if (isinstance(n.op, GpuDimShuffle) and
+        if (isinstance(n.op, (GpuDimShuffle, DimShuffle)) and
-                n.op.new_order == ('x',) * nd):
-            return grab_cpu_scalar(n.inputs[0])
-        elif (isinstance(n.op, DimShuffle) and
                n.op.new_order == ('x',) * nd):
-            return grab_cpu_scalar(n.inputs[0])
+            return grab_cpu_scalar(n.inputs[0], n.inputs[0].ndim)
-        elif isinstance(n.op, GpuFromHost):
+        elif isinstance(n.op, (GpuFromHost, HostFromGpu)):
-            return grab_cpu_scalar(n.inputs[0], nd=nd)
+            return grab_cpu_scalar(n.inputs[0], nd)
        else:
            return None
    else:
@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd):
 def find_node(v, cls, ignore_clients=False):
-    # This digs through possibly redundant transfers to for the node
+    """
-    # that has the op class specified. If ignore_clients is False (the
+    Find the node that has an op of type `cls` in `v`.
-    # default) it will only dig through nodes that have a single
-    # client.
+    This digs through possibly redundant transfers to look for the node
+    that has the type `cls`. If `ignore_clients` is False (the
+    default) it will only dig through nodes that have a single client
+    to avoid duplicating computations.
+    Parameters
+    ----------
+    v : variable
+        The variable to dig through
+    cls : Op class
+        The type of the node we are looking for
+    ignore_clients : bool, optional
+        Whether to ignore multiple clients or not.
+    """
    if v.owner is not None and (ignore_clients or len(v.clients) == 1):
        if isinstance(v.owner.op, cls):
            return v.owner
@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False):
 def is_equal(var, val):
-    # Returns True if var is always equal to val (python value), False
+    """
-    # otherwise (including if var is not constant)
+    Returns True if `var` is always equal to `val`.
+    This will only return True if the variable will always be equal to
+    the value.  If it might not be true in some cases then it returns False.
+    Parameters
+    ----------
+    var : variable
+        Variable to compare
+    val : value
+        Python value
+    """
    try:
        v = get_scalar_constant_value(var)
        return v == val
@@ -59,7 +97,57 @@ def is_equal(var, val):
        return False
-def alpha_merge(cls, alpha_in, beta_in, nd):
+def alpha_merge(cls, alpha_in, beta_in):
+    """
+    Decorator to merge multiplication by a scalar on the output.
+    This will find a pattern of scal * <yourop>(some, params, alpha,
+    beta) and update it so that the scalar multiplication happens as
+    part of your op.
+    The op needs to accept an alpha and a beta scalar which act this way:
+       out = Op() * alpha + out_like * beta
+    Where out_like is a buffer that has the same size as the output
+    and gets added to the "real" output of the operation.  An example
+    of an operation that respects this pattern is GEMM from blas.
+    The decorated function must have this signature:
+        maker(node, *inputs)
+    The `node` argument you receive is the original apply node that
+    contains your op.  You should use it to grab relevant properties
+    for your op so that the new version performs the same computation.
+    The `*inputs` parameter contains the new inputs for your op.  You
+    MUST use those inputs instead of the ones on `node`.  Note that
+    this function can be as simple as:
+        def maker(node, *inputs):
+            return node.op(*inputs)
+    Parameters
+    ----------
+    cls : op class
+        The class of the op you want to merge
+    alpha_in : int
+        The input index for the alpha scalar for your op (in node.inputs).
+    beta_in : int
+        The input index for the beta scalar for your op (in node.inputs).
+    Returns
+    -------
+    This returns an unregistered local optimizer that has the same
+    name as the decorated function.
+    Notes
+    -----
+    This was factored out since the code to deal with intervening
+    transfers and correctness in the presence of different values of
+    alpha and beta scaling factors is not trivial.
+    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
@@ -70,11 +158,14 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
                targ = find_node(node.inputs[0], cls)
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
-                    lr = grab_cpu_scalar(node.inputs[0], nd=nd)
+                    if targ is None:
+                        return
+                    lr = grab_cpu_scalar(node.inputs[0],
+                                         nd=targ.outputs[0].ndim)
                else:
-                    lr = grab_cpu_scalar(node.inputs[1], nd=nd)
+                    lr = grab_cpu_scalar(node.inputs[1],
-                if (lr is None or targ is None or
+                                         nd=targ.outputs[0].ndim)
-                        lr.dtype != targ.outputs[0].dtype):
+                if lr is None or lr.dtype != targ.outputs[0].dtype:
                    return None
                inputs = list(targ.inputs)
                try:
@@ -96,7 +187,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
    return wrapper
-def output_merge(cls, alpha_in, beta_in, out_in, nd):
+def output_merge(cls, alpha_in, beta_in, out_in):
+    """
+    Decorator to merge addition by a value on the output.
+    This will find a pattern of val + <yourop>(some, params, alpha,
+    beta, out_like) and update it so that the addition happens as
+    part of your op.
+    The op needs to accept an alpha and a beta scalar which act this way:
+       out = Op() * alpha + out_like * beta
+    Where out_like is a buffer that has the same size as the output
+    and gets added to the "real" output of the operation.  An example
+    of an operation that respects this pattern is GEMM from blas.
+    The decorated function must have this signature:
+        maker(node, *inputs)
+    The `node` argument you receive is the original apply node that
+    contains your op.  You should use it to grab relevant properties
+    for your op so that the new version performs the same computation.
+    The `*inputs` parameter contains the new inputs for your op.  You
+    MUST use those inputs instead of the ones on `node`.  Note that
+    this function can be as simple as:
+        def maker(node, *inputs):
+            return node.op(*inputs)
+    Parameters
+    ----------
+    cls : op class
+        The class of the op you want to merge
+    alpha_in : int
+        The input index for the alpha scalar for your op (in node.inputs).
+    beta_in : int
+        The input index for the beta scalar for your op (in node.inputs).
+    out_in : int
+        The input index for the out_like input for your op (in node.inputs).
+    Returns
+    -------
+    This returns an unregistered local optimizer that has the same
+    name as the decorated function.
+    Notes
+    -----
+    This was factored out since the code to deal with intervening
+    transfers and correctness in the presence of different values of
+    alpha and beta scaling factors is not trivial.
+    This also correctly handles the case where the added value is
+    broadcasted (by not performing the replacement).
+    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
@@ -126,3 +272,56 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
                return maker(targ, *inputs)
        return opt
    return wrapper
+def inplace_allocempty(op, idx):
+    """
+    Wrapper to make an inplace optimization that deals with AllocEmpty
+    This will duplicate the alloc input if it has more than one client
+    to allow the op to work on it inplace.
+    The decorated function must have this signature:
+        maker(node, inputs)
+    The `node` argument you receive is the original apply node that
+    contains your op.  You should use it to grab relevant properties
+    for your op so that the new version performs the same computation.
+    You should also switch the op to work inplace.  The `inputs`
+    parameter contains the new inputs for your op.  You MUST use
+    those inputs instead of the ones on `node`.  Note that this
+    function can be as simple as:
+        def maker(node, inputs):
+            return node.op.__class__(inplace=True)(*inputs)
+    Parameters
+    ----------
+    op : op class
+        The op class to look for to make inplace
+    idx : int
+        The index of the (possibly) AllocEmpty input (in node.inputs).
+    Returns
+    -------
+    This returns an unregistered inplace local optimizer that has the
+    same name as the decorated function.
+    """
+    def wrapper(maker):
+        @local_optimizer([op], inplace=True)
+        @wraps(maker)
+        def opt(node):
+            if type(node.op) != op or node.op.inplace:
+                return
+            inputs = list(node.inputs)
+            alloc = inputs[idx]
+            if (alloc.owner and
+                    isinstance(alloc.owner.op, GpuAllocEmpty) and
+                    len(alloc.clients) > 1):
+                alloc_op = GpuAllocEmpty(alloc.owner.op.dtype)
+                inputs[idx] = alloc_op(*alloc.owner.inputs)
+            return maker(node, inputs)
+        return opt
+    return wrapper
--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
    def _f16_ok(self):
        return self.iadd_node.op._f16_ok
-    def c_header_dirs(self):
-        cuda_root = config.cuda.root
-        if cuda_root:
-            return [os.path.join(cuda_root, 'include')]
-        else:
-            return []
    def c_headers(self):
        return self.iadd_node.op.c_headers()
-    def c_compiler(self):
-        return self.iadd_node.op.c_compiler()
    def c_init_code(self):
        return self.iadd_node.op.c_init_code()
@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        elemwise_version = self.iadd_node.c_code_cache_version()
        if not parent_version or not elemwise_version:
            return
-        return parent_version + elemwise_version + (2,)
+        return parent_version + elemwise_version + (3,)
 class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):

--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
 import unittest
 from theano.compat import izip
-from copy import copy, deepcopy
 from six import iteritems
@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
 # Don't import test classes otherwise they get tested as part of the file
 from theano.tensor.tests import test_basic
 from theano.tensor.tests.test_basic import rand, safe_make_node
+from theano.tests import unittest_tools as utt
 from theano.tests.unittest_tools import SkipTest
 import theano.sandbox.gpuarray
+from ..type import (GpuArrayType,
+                    gpuarray_shared_constructor)
+from ..basic_ops import (
+    host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
+    gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
+    gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
+from ..subtensor import GpuSubtensor
+import theano.sandbox.cuda as cuda_ndarray
+try:
+    from pygpu import gpuarray
+except:
+    pass
 if theano.sandbox.gpuarray.pygpu is None:
    raise SkipTest("pygpu not installed")
 # If you are writing a new test file, don't copy this code, but rather
 # import stuff from this file (like mode_with_gpu) to reuse it.
-import theano.sandbox.cuda as cuda_ndarray
 if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
    if not cuda_ndarray.use.device_number:
        # We should not enable all the use like the flag device=gpu,
@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
 if not theano.sandbox.gpuarray.pygpu_activated:
    raise SkipTest("pygpu disabled")
-from ..type import (GpuArrayType,
-                    gpuarray_shared_constructor)
-from ..basic_ops import (
-    host_from_gpu, gpu_from_host,
-    gpu_alloc, GpuAlloc,
-    GpuAllocEmpty,
-    gpu_from_cuda,
-    cuda_from_gpu, HostFromGpu,
-    GpuContiguous,
-    GpuFromHost, GpuReshape,
-    gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
-from ..subtensor import GpuSubtensor
-from theano.tests import unittest_tools as utt
 utt.seed_rng()
 rng = numpy.random.RandomState(seed=utt.fetch_seed())
-from pygpu import gpuarray
 if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
@@ -63,22 +61,6 @@ else:
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
-def may_fail(msg, EClass):
-    """Mark a test that requires very specific conditions to work to
-       mask a specific exception class."""
-    def test_decorator(f):
-        def wrapper():
-            try:
-                f()
-            except Exception as e:
-                if isinstance(e, EClass):
-                    raise SkipTest(msg, e)
-                raise
-        wrapper.__name__ = f.__name__
-        return wrapper
-    return test_decorator
 def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
                 on_unused_input='raise', name=None):
    if mode is None:
@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
                    else:
                        err_msg = ("Test %s::%s: exception raised during test "
                                   "call was not the same as the reference "
-                                   "call (got: %s, expected %s)") % \
+                                   "call (got: %s, expected %s)" %
                                   (self.gpu_op, testname, type(exc),
-                                    type(ref_e))
+                                    type(ref_e)))
                        exc.args += (err_msg,)
                        raise
@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
                                                        expected):
                    self.fail(("Test %s::%s: Output %s gave the wrong "
                               "value. With inputs %s, expected %s "
-                               "(dtype %s), got %s (dtype %s).") % (
+                               "(dtype %s), got %s (dtype %s)." %
-                            self.op, testname, i, inputs, expected,
+                               (self.op, testname, i, inputs, expected,
-                            expected.dtype, variable, variable.dtype))
+                                expected.dtype, variable, variable.dtype)))
            for description, check in iteritems(self.checks):
                if not check(inputs, variables):
@@ -250,36 +232,6 @@ def test_transfer_strided():
    assert numpy.all(fv == av)
-@may_fail("Op fails if both contexts are not the same and it's rare "
-          "that the tests will be run this way", ValueError)
-def test_transfer_cuda_gpu():
-    import theano.sandbox.cuda as cuda_ndarray
-    if cuda_ndarray.cuda_available is False:
-        raise SkipTest("Can't test interaction with cuda if cuda not present")
-    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
-    c = cuda_ndarray.CudaNdarrayType((False, False))('c')
-    av = theano._asarray(rng.rand(5, 4), dtype='float32')
-    gv = gpuarray.array(av)
-    cv = cuda_ndarray.CudaNdarray(av)
-    gvs = gv[:, ::-2]
-    cvs = cv[:, ::-2]
-    f = theano.function([c], gpu_from_cuda(c))
-    fv = f(cv)
-    assert GpuArrayType.values_eq_approx(fv, gv)
-    fvs = f(cvs)
-    assert GpuArrayType.values_eq_approx(fvs, gvs)
-    f = theano.function([g], cuda_from_gpu(g))
-    fv = f(gv)
-    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)
-    fvs = f(gvs)
-    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
 def gpu_alloc_expected(x, *shp):
    g = gpuarray.empty(shp, dtype=x.dtype)
    g[:] = x
@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
    gpu_op=gpu_alloc,
    cases=dict(
        correct01=(rand(), numpy.int32(7)),
-# just gives a DeepCopyOp with possibly wrong results on the CPU
+        # just gives a DeepCopyOp with possibly wrong results on the CPU
-#        correct01_bcast=(rand(1), numpy.int32(7)),
+        # correct01_bcast=(rand(1), numpy.int32(7)),
        correct02=(rand(), numpy.int32(4), numpy.int32(7)),
        correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
        correct13=(rand(7), numpy.int32(2), numpy.int32(4),
@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
    cv = gpuarray.asarray(numpy.random.rand(5, 4),
                          dtype='float32')
-    gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
-    host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu
    f = theano.function([a], gpu_from_host(a), mode=m)
    assert gpu_from_host in [x.op
                             for x in f.maker.fgraph.toposort()]

--- a/theano/sandbox/gpuarray/tests/test_blas.py
+++ b/theano/sandbox/gpuarray/tests/test_blas.py
@@ -6,8 +6,7 @@ import numpy
 import theano
 from theano import tensor
 from theano.tests import unittest_tools as utt
-from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
+from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
-                                _dot22)
 from theano.tensor.tests.test_blas import TestGer, BaseGemv
 from .. import gpuarray_shared_constructor
@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
                             mode_with_gpu)
 from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
-                    gpugemm_inplace, gpugemm_no_inplace,
+                    gpugemm_inplace,
                    gpuger_inplace, gpuger_no_inplace,
                    GpuGer, gpu_dot22, GpuGemm)
-GpuGemvTester = makeTester('GpuGemvTester',
+GpuGemvTester = makeTester(
+    'GpuGemvTester',
    op=gemv_inplace, gpu_op=gpugemv_inplace,
-                           cases=dict(
+    cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
-        dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
               dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
-#        test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
+               # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
-#        test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
+               # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
-#        test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
+               # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
               test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
               )
-)
+    )
 class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
@@ -48,10 +47,10 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
            return theano.shared(val)
-GpuGemmTester = makeTester('GpuGemmTester',
+GpuGemmTester = makeTester(
+    'GpuGemmTester',
    op=gemm_inplace, gpu_op=gpugemm_inplace,
-                           cases=dict(
+    cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
-        test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
               test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
               test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
               test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
@@ -65,7 +64,7 @@ GpuGemmTester = makeTester('GpuGemmTester',
               # test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
               # test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
               )
-)
+    )
 class TestGpuSger(TestGer):
@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
    def test_f32_0_0(self):
        raise SkipTest('0-sized objects not supported')
    def test_f32_1_0(self):
        raise SkipTest('0-sized objects not supported')
    def test_f32_0_1(self):
        raise SkipTest('0-sized objects not supported')
@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
 GpuDot22Tester = makeTester(
-    'GpuGemmTester',
+    'GpuDot22Tester',
    op=_dot22, gpu_op=gpu_dot22,
    cases=dict(
        test1=[rand(3, 4), rand(4, 5)],
        test2=[rand(1, 4), rand(4, 5)],
        test3=[rand(3, 1), rand(1, 5)],
        test4=[rand(3, 4), rand(4, 1)],
-#        test5=[rand(0, 4), rand(4, 5)],
+        # test5=[rand(0, 4), rand(4, 5)],
-#        test6=[rand(3, 0), rand(0, 5)],
+        # test6=[rand(3, 0), rand(0, 5)],
-#        test7=[rand(3, 4), rand(4, 0)],
+        # test7=[rand(3, 4), rand(4, 0)],
-#        test8=[rand(0, 4), rand(4, 0)],
+        # test8=[rand(0, 4), rand(4, 0)],
-#        test9=[rand(0, 0), rand(0, 0)],
+        # test9=[rand(0, 0), rand(0, 0)],
    )
 )
 def test_hgemm_swap():
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
@@ -149,6 +151,7 @@ def test_hgemm_swap():
    utt.assert_allclose(of, on)
 def test_hgemm_alpha_output_merge():
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':

--- a/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
@@ -6,32 +6,31 @@ import sys
 import time
 import unittest
 import numpy
 from six.moves import xrange
-from nose.plugins.skip import SkipTest
-imported_scipy_convolve2d = False
-try:
-    from scipy.signal import convolve2d
-    imported_scipy_convolve2d = True
-except ImportError:
-    pass
 import theano
 from theano import tensor
 from theano.tests.unittest_tools import seed_rng
 # We let that import do the init of the back-end if needed.
-from .test_basic_ops import (mode_with_gpu,
+from .test_basic_ops import mode_with_gpu
-                             mode_without_gpu)
 from ..type import GpuArrayType
 from ..conv import GpuConv
 from theano.sandbox.gpuarray import dnn
 import pygpu
+imported_scipy_convolve2d = False
+try:
+    from scipy.signal import convolve2d
+    imported_scipy_convolve2d = True
+except ImportError:
+    pass
 gftensor4 = GpuArrayType('float32', [False] * 4)
 def py_conv_valid_numpy(img, kern):
    assert img.shape[1] == kern.shape[1]
    outshp = (img.shape[0], kern.shape[0],
@@ -191,15 +190,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t2 - t1)
        if verbose > 0:
-            print('%15s' % str(ishape), '%15s' % str(kshape), end=' ', file=sys.stdout)
+            print('%15s' % str(ishape), '%15s' % str(kshape), end=' ',
-            print('%12.5f  %7.2f %7.2f %7.1f' % (approx_fp,
+                  file=sys.stdout)
-                    cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)), file=sys.stdout)
+            print('%12.5f  %7.2f %7.2f %7.1f' %
+                  (approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)),
+                  file=sys.stdout)
    if not rval:
-        print(('test_' + mode + ' id=' + str(id) +
+        print('test_' + mode + ' id=' + str(id) +
              ' FAILED for ishape, kshape, mode, subsample,' +
              ' img_stride, kern_stride, version', ishape,
              kshape, mode, subsample, img_stride, kern_stride,
-                              version), file=sys.stdout)
+              version, file=sys.stdout)
        diff = cpuval - gpuval
        diffabs = numpy.absolute(diff)
        pr_diff = diffabs / numpy.absolute(cpuval)
@@ -210,7 +211,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                                   nb_close, "/", diff.size))
        print("max relatif diff:", (pr_diff.max(), "avg rel diff:",
                                    numpy.average(pr_diff)))
-    if not rval and print_ != False:
+    if not rval and print_ is not False:
        if npy_img.shape[0] > 5:
            print("img", npy_img[0])
            print("kern", npy_kern[0])
@@ -242,7 +243,8 @@ def exec_conv(version, shapes, verbose, random, mode,
                 istride, kstride) in enumerate(shapes):
            ret = False
            try:
-                ret = _params_allgood(ishape,
+                ret = _params_allgood(
+                    ishape,
                    kshape,
                    mode,
                    subsample=subshape,
@@ -297,15 +299,15 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
        ((3, 1) + imshp, (1, 1) + kshp, subsample, img_stride, kern_stride),
        # nkern only
        ((1, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
-        #batch and nkern
+        # batch and nkern
        ((3, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
-        #batch and stack
+        # batch and stack
        ((3, 2) + imshp, (1, 2) + kshp, subsample, img_stride, kern_stride),
-        #stack and nkern
+        # stack and nkern
        ((1, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
-        #batch, nkern and stack
+        # batch, nkern and stack
        ((2, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
-        #batch, nkern and stack
+        # batch, nkern and stack
        ((3, 2) + imshp, (4, 2) + kshp, subsample, img_stride, kern_stride)
        ]
@@ -344,7 +346,6 @@ def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
 def get_valid_shapes():
    #          img shape,     kern shape, subsample shape
    shapes = get_basic_shapes()
@@ -361,37 +362,34 @@ def get_valid_shapes():
    shapes += [
        # other test
-              ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
+        ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))
+        ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
-            , ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
+        ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize
+        ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize
-            , ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize, non-square image
+        ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image
-            , ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize, non-square image, non-square kern
+        ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern
-            , ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
+        ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern, kernsize==imgsize on one dim
-            , ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1))  # a big one
+        ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)),  # a big one
-            , ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # MNIST LeNET layer 1
+        ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # MNIST LeNET layer 1
-            , ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1))  # layer 1 backprop to weights
+        ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)),  # layer 1 backprop to weights
-            , ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1))  # added a test case that fail from test_nnet.py.test_conv_nnet2
+        ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)),  # added a test case that fails, from test_nnet.py.test_conv_nnet2
-            , ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1))  # test precedent but reduced that triger the error
+        ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)),  # previous test, reduced, that triggers the error
        # Test more than maxThreadsDim0
-            , ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
+        ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
-            , ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
+        ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
        ]
-    shapes += [ ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 1 layers
+    shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 1 layers
-            , ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 2 layers
+               ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 2 layers
-            , ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 bprop 1 full
+               ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 1 full
-            , ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1))  # test_lenet_28 bprop 2 valid
+               ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 2 valid
-#            , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
+               ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 1 layers
-            , ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 1 layers
+               ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 2 layers
-            , ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 2 layers
+               ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 full
-            , ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 full
-#            , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
-#            , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
               ]
    return shapes
@@ -428,42 +426,34 @@ def test_full():
    shapes += [
        # other test
-              ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
+        ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))
+        ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
-            , ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
+        ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
-            , ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
+        ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize
+        ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize
-            , ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize, non-square image
+        ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image
-            , ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize, non-square image, non-square kern
+        ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern
-            , ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1))  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
+        ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern, kernsize==imgsize on one dim
-            , ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1))  # a big one
+        ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)),  # a big one
-            , ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # MNIST LeNET layer 1
+        ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # MNIST LeNET layer 1
-            , ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1))  # layer 1 backprop to weights
+        ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)),  # layer 1 backprop to weights
        # other test
-            , ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1))  # kernel bigger then image
+        ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)),  # kernel bigger than image
-            , ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1))
+        ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1))
+        ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)),
-            , ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1))  # kernel bigger then image
+        ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)),  # kernel bigger than image
-            , ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
+        ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)),
        ]
    shapes += [
-#        ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
+        ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 1 full
-#            , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
+        ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 full
-             ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 bprop 1 full
-#            , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
-#            , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
-#            , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers
-#            , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers
-            , ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # test_lenet_64 full
-#            , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
-#            , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
        # Test more than maxThreadsDim0
-            , ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
+        ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
-            , ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
+        ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
        ]
    version = [-1]
@@ -562,7 +552,6 @@ class TestConv2DGPU(unittest.TestCase):
                for mode in ['valid', 'full']:
                    for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
                                   ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
-                                   #((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
                                   # We use only the number of columns.
                                   ]:
@@ -580,47 +569,45 @@ def benchmark():
    shapes_valid = [
        # test_lenet_28 shape
-        ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1))  # valid
+        ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # valid
+        ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_32 shape
-        , ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1))  # valid
+        ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # valid
+        ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_64 shape
-        , ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1))  # valid
+        ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1))  # valid
+        ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_108 shape
-        , ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1))  # valid
+        ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1))  # valid
+        ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_256 shape
-        , ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1))  # valid
+        ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1))  # valid
+        ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1))  # valid
+        ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)),  # valid
-        , ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1))  # valid
+        ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)),  # valid
        ]
    shapes_full = [
        # test_lenet_28 shape
-         ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1))  # full
+        ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # full
        # test_lenet_32 shape
-         , ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1))  # full conv_full_patch_stack_padded' N=1
+        ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # full 'conv_full_patch_stack_padded' N=1
        # test_lenet_64 shape
-         , ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # full conv_full_patch_stack_padded' N=3
+        ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # full 'conv_full_patch_stack_padded' N=3
        # test_lenet_108 shape
-         , ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1))  # full 'conv_full_patch_stack_padded' N=9
+        ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # full 'conv_full_patch_stack_padded' N=9
        # test_lenet_256 shape
-         , ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1))  # full conv_reference_full
+        ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)),  # full conv_reference_full
        ]
-#    shapes_valid=shapes_valid[-1:]
-#    shapes_full=shapes_full[-1:]
    version = [-1]
    verbose = 1
    random = True

--- a/theano/sandbox/gpuarray/tests/test_neighbours.py
+++ b/theano/sandbox/gpuarray/tests/test_neighbours.py
-import unittest
 from theano.tensor.nnet.tests import test_neighbours
 # We let that import do the init of the back-end if needed.
-from .test_basic_ops import (mode_with_gpu,
+from .test_basic_ops import mode_with_gpu
-                             mode_without_gpu)
 from ..neighbours import GpuImages2Neibs

--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
 from __future__ import print_function
-from nose.plugins.skip import SkipTest
 import numpy
 import unittest
@@ -7,8 +7,6 @@ import theano
 import theano.tensor as T
 import theano.tests.unittest_tools as utt
-from theano.sandbox import gpuarray
 # We let that import do the init of the back-end if needed.
 from .test_basic_ops import (mode_with_gpu,
                             mode_without_gpu)
@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
        n_in = 4098
        n_out = 4099
-    x = T.fmatrix('x')
    y = T.lvector('y')
    b = T.fvector('b')
-    #W = T.fmatrix('W')
    # we precompute the dot with big shape before to allow the test of
    # GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
-    #(the launch timed out and was terminated) on GPU card not
+    # (the launch timed out and was terminated) on GPU card not
    # powerful enough. We need the big shape to check for corner
    # case.
    dot_result = T.fmatrix('dot_result')
@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
                       dtype=numpy.float32)
-    #?????yy = numpy.ones((batch_size,),dtype='float32')
    yy = numpy.ones((batch_size,), dtype='int32')
    b_values = numpy.zeros((n_out,), dtype='float32')
    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)
-    # theano.printing.debugprint(classify)
-    # theano.printing.debugprint(classify_gpu)
    assert any([isinstance(node.op,
                           T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
    We check that we loop when their is too much threads
    """
-    n_in = 1000
    batch_size = 4097
    n_out = 1250
    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
-        n_in = 4098
        n_out = 4099
    # Seed numpy.random with config.unittests.rseed
@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
    rtol = 1e-5
    atol = 1e-6
-    if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
+    utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
-        abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
-        scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
-        max_i = scaled_err.argmax()
-        print('max err index:', max_i, max_i / batch_size, end=' ')
-        print(max_i % batch_size, max_i / n_out, max_i & n_out)
-        print('At that index:')
-        print('err:', scaled_err.flatten()[max_i])
-        print('absolute error:', abs_err.flatten()[max_i])
-        print('relative error:', rel_err.flatten()[max_i])
-        print('cpu_out:', cpu_out.flatten()[max_i])
-        print('gpu_out:', gpu_out.flatten()[max_i])
-        print('softmax_output_value:', softmax_output_value.flatten()[max_i])
-        print('dnll_value:', dnll_value[max_i / n_out])
-        print('y_idx_value:', y_idx_value[max_i / n_out])
-        assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
-            rtol, atol)
 def test_softmax_with_bias_float16():
@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
    softmax_with_bias_unittest_template(dtypeInput='float32',
                                        dtypeBias='float16')
 def test_softmax_with_bias_float32():
    softmax_with_bias_unittest_template(dtypeInput='float32',
                                        dtypeBias='float32')
@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
    TODO: check that we loop when there are too many threads. (THIS IS
    NOT IMPLEMENTED)
    """
    x = T.matrix('x', dtype=dtypeInput)
    b = T.vector('b', dtype=dtypeBias)
@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
 def test_softmax_float16():
    softmax_unittest_template('float16')
 def test_softmax_float32():
    softmax_unittest_template('float32')
 def test_softmax_float64():
    softmax_unittest_template('float64')

--- a/theano/sandbox/gpuarray/tests/test_type.py
+++ b/theano/sandbox/gpuarray/tests/test_type.py
-import operator
 import numpy
 import theano
@@ -25,7 +23,6 @@ def test_deep_copy():
 def test_values_eq_approx():
    a = rand_gpuarray(20, dtype='float32')
-    g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
    assert GpuArrayType.values_eq_approx(a, a)
    b = a.copy()
    b[0] = numpy.asarray(b[0]) + 1.

--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
@@ -200,11 +200,12 @@ class GpuArrayType(Type):
                self.broadcastable == other.broadcastable)
    def convert_variable(self, var):
-        if (type(self) == type(var.type) and
+        vt = var.type
-                self.typecode == var.type.typecode and
+        if (type(self) == type(vt) and
-                self.ndim == var.type.ndim and
+                self.typecode == vt.typecode and
+                self.ndim == vt.ndim and
                all(sb == ob or ob for sb, ob in zip(self.broadcastable,
-                                                     var.type.broadcastable))):
+                                                     vt.broadcastable))):
            return theano.tensor.patternbroadcast(var, self.broadcastable)
    def __hash__(self):

--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
@@ -157,24 +157,11 @@ whitelist_flake8 = [
    "sandbox/linalg/ops.py",
    "sandbox/linalg/__init__.py",
    "sandbox/linalg/tests/test_linalg.py",
-    "sandbox/gpuarray/basic_ops.py",
-    "sandbox/gpuarray/nnet.py",
-    "sandbox/gpuarray/elemwise.py",
-    "sandbox/gpuarray/type.py",
    "sandbox/gpuarray/__init__.py",
-    "sandbox/gpuarray/kernel_codegen.py",
-    "sandbox/gpuarray/conv.py",
-    "sandbox/gpuarray/neighbours.py",
    "sandbox/gpuarray/tests/test_subtensor.py",
    "sandbox/gpuarray/tests/test_scan.py",
-    "sandbox/gpuarray/tests/test_neighbours.py",
-    "sandbox/gpuarray/tests/test_conv_cuda_ndarray.py",
-    "sandbox/gpuarray/tests/test_type.py",
    "sandbox/gpuarray/tests/test_opt.py",
-    "sandbox/gpuarray/tests/test_blas.py",
    "sandbox/gpuarray/tests/test_elemwise.py",
-    "sandbox/gpuarray/tests/test_nnet.py",
-    "sandbox/gpuarray/tests/test_basic_ops.py",
    "scan_module/scan_utils.py",
    "scan_module/scan_views.py",
    "scan_module/scan.py",