Merge pull request #3482 from abergeron/multi_gpu_new2

Multi-gpu support

Merge pull request #3482 from abergeron/multi_gpu_new2
4814cd99 · Pascal Lamblin · 6ca7b2b6 · ec927f7d · 4814cd99 · 4814cd99
--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -112,7 +112,8 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
 if (config.device.startswith('cuda') or
        config.device.startswith('opencl') or
        config.init_gpu_device.startswith('cuda') or
-        config.init_gpu_device.startswith('opencl')):
+        config.init_gpu_device.startswith('opencl') or
+        config.contexts != ''):
    import theano.sandbox.gpuarray

 # Use config.numpy to call numpy.seterr

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -111,6 +111,29 @@ AddConfigVar(
    BoolParam(False, allow_override=False),
    in_c_key=False)

+
+class ContextsParam(ConfigParam):
+    """
+    Config parameter holding the multi-gpu context map.
+
+    The value is either the empty string (the default) or a
+    semicolon-separated list of entries, each of which must contain
+    exactly one '->' separator.
+    """
+    def __init__(self):
+        def check_map(val):
+            # An empty value has no entries to check; otherwise every
+            # entry must split into exactly two parts around '->'.
+            if val != '':
+                for entry in val.split(';'):
+                    if len(entry.split('->')) != 2:
+                        raise ValueError("Malformed context map: %s" %
+                                         (entry,))
+            return val
+        ConfigParam.__init__(self, '', check_map, False)
+
+AddConfigVar(
+    'contexts',
+    """
+    Context map for multi-gpu operation. Format is a
+    semicolon-separated list of names and device names in the
+    'name->dev_name' format. An example that would map name 'test' to
+    device 'cuda0' and name 'test2' to device 'opencl0:0' follows:
+    "test->cuda0;test2->opencl0:0".
+    """, ContextsParam(), in_c_key=False)
+
 AddConfigVar(
    'print_active_device',
    "Print active device at when the GPU device is initialized.",

--- a/theano/misc/check_multi_gpu.py
+++ b/theano/misc/check_multi_gpu.py
+#! /usr/bin/env python
+"""
+This file compares the runtime of two independent dot products on one
+and on two GPUs to measure the speedup.
+
+This should be 2x if the GPUs are equivalent.
+"""
+import time
+
+import numpy
+
+import theano
+from theano.sandbox.gpuarray import init_dev
+from theano.sandbox.gpuarray.type import gpuarray_shared_constructor as shared
+from theano.sandbox.gpuarray.blas import gpu_dot22
+
+
+def main(dev1, dev2):
+    """
+    Time two independent dot products on one context, then on two.
+
+    `dev1` and `dev2` are device names handed to `init_dev`; they are
+    registered under the context names 'ctx1' and 'ctx2'. Both timings
+    are printed.
+    """
+    init_dev(dev1, 'ctx1')
+    init_dev(dev2, 'ctx2')
+
+    def rand_shared(ctx_name):
+        # Random 1024x1024 float32 matrix stored on the given context.
+        data = numpy.random.randn(1024, 1024).astype('float32')
+        return shared(data, context_name=ctx_name)
+
+    val1a = rand_shared('ctx1')
+    val1b = rand_shared('ctx1')
+    val1c = rand_shared('ctx1')
+    val1d = rand_shared('ctx1')
+
+    val2a = rand_shared('ctx2')
+    val2b = rand_shared('ctx2')
+
+    # f1 runs both products on ctx1; f2 runs one product per context.
+    f1 = theano.function([], [gpu_dot22(val1a, val1b),
+                              gpu_dot22(val1c, val1d)])
+    f2 = theano.function([], [gpu_dot22(val1a, val1b),
+                              gpu_dot22(val2a, val2b)])
+
+    def timed_run(fn):
+        # Warm-up call; its results are dropped before timing starts.
+        outs = fn()
+        outs[0].sync()
+        outs[1].sync()
+        outs = None
+        start = time.time()
+        outs = fn()
+        # Wait for both outputs so the measured interval covers the
+        # whole computation.
+        outs[0].sync()
+        outs[1].sync()
+        stop = time.time()
+        outs = None
+        return stop - start
+
+    print("one ctx %f" % (timed_run(f1),))
+
+    print("two ctx %f" % (timed_run(f2),))
+
+# Script entry point: expects exactly two device names on the command
+# line and hands them to main().
+if __name__ == '__main__':
+    import sys
+    if len(sys.argv) != 3:
+        raise ValueError("This script require two device names.")
+    main(sys.argv[1], sys.argv[2])
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -92,10 +92,7 @@ class HostFromGpu(GpuOp):

    def R_op(self, inputs, eval_points):
        ev, = eval_points
-        if isinstance(ev, tensor.TensorType):
-            return [gpu_from_host(ev)]
-        else:
-            return [ev]
+        # Apply this op to the evaluation point. The result is wrapped
+        # in a one-element list, the same return shape as the code
+        # being replaced.
+        return [self(ev)]

    def infer_shape(self, node, xshp):
        return xshp
@@ -155,10 +152,7 @@ class GpuFromHost(GpuOp):

    def R_op(self, inputs, eval_points):
        ev, = eval_points
-        if isinstance(ev, CudaNdarrayType):
-            return [host_from_gpu(ev)]
-        else:
-            return [ev]
+        # Apply this op to the evaluation point and return it. Without
+        # the return statement the method yields None. The result is
+        # wrapped in a one-element list, the same return shape as the
+        # code being replaced.
+        return [self(ev)]

    def infer_shape(self, node, xshp):
        return xshp

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -2478,8 +2478,11 @@ def local_gpu_allocempty(node):
    return False


+def typeInfer(node):
+    # `node` is ignored: the module-level typeConstructor is returned
+    # for every node.
+    return typeConstructor
+
 optdb.register('gpu_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
+               scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer,
                                             gpu_flag=True),
               75,
               'gpu',

--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -21,26 +21,30 @@ except ImportError:

 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
-                  GpuArraySharedVariable, gpuarray_shared_constructor)
+                   GpuArraySharedVariable, gpuarray_shared_constructor,
+                   reg_context)
 from . import opt, nerv


-def init_dev(dev):
+def init_dev(dev, name=None):
+    """
+    Initialize device `dev` and register its context under `name`.
+
+    Raises RuntimeError when the pygpu API version is not
+    (-10000, 0). A device is initialized with `pygpu.init` only the
+    first time it is requested; the resulting context is cached in
+    `init_dev.devmap` and reused afterwards. The module-level
+    `pygpu_activated` flag is set to True, and the mapping is printed
+    to stderr when `config.print_active_device` is set.
+    """
    if pygpu.gpuarray.api_version() != (-10000, 0):
        raise RuntimeError("Wrong API version for gpuarray:",
                           pygpu.gpuarray.api_version(),
                           "Make sure Theano and libgpuarray/pygpu "
                           "are in sync.")
    global pygpu_activated
-    context = pygpu.init(dev)
-    pygpu.set_default_context(context)
+    if dev not in init_dev.devmap:
+        init_dev.devmap[dev] = pygpu.init(dev)
+    context = init_dev.devmap[dev]
+    # This will map the context name to the real context object.
+    reg_context(name, context)
    pygpu_activated = True
    if config.print_active_device:
-        print("Using device %s: %s" % (dev, context.devname), file=sys.stderr)
-    # remember the active device
-    init_dev.device = dev
+        print("Mapped name %s to device %s: %s" % (name, dev, context.devname),
+              file=sys.stderr)

-init_dev.device = None
+# This maps things like 'cuda0' to the context object on that device.
+init_dev.devmap = {}

 if pygpu:
    try:
@@ -52,11 +56,21 @@ if pygpu:
            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
        elif (config.init_gpu_device.startswith('cuda') or
              config.init_gpu_device.startswith('opencl')):
+            if config.device != 'cpu':
+                raise ValueError('you must set device=cpu to use init_gpu_device.')
+            if config.contexts != '':
+                print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
            init_dev(config.init_gpu_device)
+        if config.contexts != '':
+            for n, d in (c.split('->') for c in config.contexts.split(';')):
+                init_dev(d.strip(), n.strip())
+            import theano.compile
+            theano.compile.shared_constructor(gpuarray_shared_constructor)
+            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')

        from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
                                GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
-        from .basic_ops import host_from_gpu, gpu_from_host
+        from .basic_ops import host_from_gpu, GpuFromHost
        from .elemwise import GpuElemwise
        from .subtensor import (GpuSubtensor, GpuIncSubtensor,
                                GpuAdvancedIncSubtensor1)
@@ -67,5 +81,6 @@ else:
    if (config.init_gpu_device.startswith('cuda') or
            config.init_gpu_device.startswith('opencl') or
            config.device.startswith('opencl') or
-            config.device.startswith('cuda')):
+            config.device.startswith('cuda') or
+            config.contexts != ''):
        error("pygpu was configured but could not be imported", exc_info=True)
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
--- a/theano/sandbox/gpuarray/blas.py
+++ b/theano/sandbox/gpuarray/blas.py
 import os.path

-from theano import Apply, config
+from theano import Apply, config, Op

 from theano.compile import optdb
-from theano.gof import local_optimizer, LocalOptGroup
+from theano.gof import LocalOptGroup
 from theano.tensor.basic import as_tensor_variable
-from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
 from theano.tensor.opt import in2out

-from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
+from .basic_ops import as_gpuarray_variable, infer_context_name
+
+from .opt_util import inplace_allocempty

 try:
    import pygpu
@@ -18,7 +19,7 @@ except ImportError as e:
    pass


-class BlasOp(HideC):
+class BlasOp(Op):
    def c_headers(self):
        return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>']

@@ -28,34 +29,27 @@ class BlasOp(HideC):
    def c_init_code(self):
        return ['import_pygpu__blas();']

-    def c_support_code(self):
-        return """
-PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
-                                   PyGpuArrayObject *y) {
-  if (out &&
-      GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
-      theano_size_check(out, PyGpuArray_NDIM(y),
-                        PyGpuArray_DIMS(y),
-                        y->ga.typecode)) {
-    if (pygpu_move(out, y)) {
-      Py_XDECREF(out);
-      return NULL;
-    }
-  } else {
-    Py_XDECREF(out);
-    out = pygpu_copy(y, GA_ANY_ORDER);
-  }
-  return out;
-}
-"""
-
-
-class GpuGemv(BlasOp, Gemv):
+
+class GpuGemv(BlasOp):
+    __props__ = ('inplace',)
+
+    def __init__(self, inplace=False):
+        """
+        Store the `inplace` flag; when it is set, declare through
+        `destroy_map` that output 0 overwrites input 0.
+        """
+        self.inplace = inplace
+        if inplace:
+            self.destroy_map = {0: [0]}
+
    def make_node(self, y, alpha, A, x, beta):
-        Gemv.make_node(self, y, alpha, A, x, beta)
-        A = as_gpuarray_variable(A)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        # The context name is inferred from the three array inputs,
+        # which are then converted with that name. alpha and beta must
+        # be 0-d, A 2-d, x and y 1-d, and the arrays must share one
+        # dtype. The output variable has the same type as y.
+        ctx_name = infer_context_name(y, A, x)
+        A = as_gpuarray_variable(A, ctx_name)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
+        alpha = as_tensor_variable(alpha)
+        beta = as_tensor_variable(beta)
+        assert alpha.ndim == 0
+        assert beta.ndim == 0
+        assert A.ndim == 2
+        assert x.ndim == 1
+        assert y.ndim == 1
        assert A.dtype == x.dtype == y.dtype
        return Apply(self, [y, alpha, A, x, beta], [y.type()])

@@ -73,7 +67,7 @@ class GpuGemv(BlasOp, Gemv):
        if self.inplace:
            code = """
                   if (%(y)s->ga.strides[0] <= 0) {
-                     %(out)s = gpublas_try_copy(%(out)s, %(y)s);
+                     %(out)s = theano_try_copy(%(out)s, %(y)s);
                     if (%(out)s == NULL) {
                       %(fail)s
                     }
@@ -85,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
                   """ % vars
        else:
            code = """
-                   %(out)s = gpublas_try_copy(%(out)s, %(y)s);
+                   %(out)s = theano_try_copy(%(out)s, %(y)s);
                   if (%(out)s == NULL) {
                       %(fail)s
                   }
@@ -106,21 +100,33 @@ class GpuGemv(BlasOp, Gemv):
        return code

    def c_code_cache_version(self):
-        return (3,)
+        return (4,)

 gpugemv_no_inplace = GpuGemv(inplace=False)
 gpugemv_inplace = GpuGemv(inplace=True)


-class GpuGemm(BlasOp, Gemm):
+class GpuGemm(BlasOp):
+    __props__ = ('inplace',)
    _f16_ok = True

+    def __init__(self, inplace=False):
+        """
+        Store the `inplace` flag; when it is set, declare through
+        `destroy_map` that output 0 overwrites input 0.
+        """
+        self.inplace = inplace
+        if inplace:
+            self.destroy_map = {0: [0]}
+
    def make_node(self, C, alpha, A, B, beta):
+        # The context name is inferred from the three array inputs,
+        # which are then converted with that name. alpha and beta must
+        # be 0-d, A, B and C 2-d, and the arrays must share one dtype.
+        # The output variable has the same type as C.
+        ctx_name = infer_context_name(C, A, B)
+        A = as_gpuarray_variable(A, ctx_name)
+        B = as_gpuarray_variable(B, ctx_name)
+        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)
-        A = as_gpuarray_variable(A)
-        B = as_gpuarray_variable(B)
-        C = as_gpuarray_variable(C)
+        assert alpha.ndim == 0
+        assert beta.ndim == 0
+        assert A.ndim == 2
+        assert B.ndim == 2
+        assert C.ndim == 2
        assert A.dtype == B.dtype == C.dtype
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

@@ -138,7 +144,7 @@ class GpuGemm(BlasOp, Gemm):
        if self.inplace:
            code = """
                   if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
-                     %(out)s = gpublas_try_copy(%(out)s, %(C)s);
+                     %(out)s = theano_try_copy(%(out)s, %(C)s);
                     if (%(out)s == NULL) {
                       %(fail)s
                     }
@@ -150,7 +156,7 @@ class GpuGemm(BlasOp, Gemm):
                   """ % vars
        else:
            code = """
-                   %(out)s = gpublas_try_copy(%(out)s, %(C)s);
+                   %(out)s = theano_try_copy(%(out)s, %(C)s);
                   if (%(out)s == NULL) {
                       %(fail)s
                   }
@@ -171,25 +177,36 @@ class GpuGemm(BlasOp, Gemm):
        return code

    def c_code_cache_version(self):
-        return (4,)
-
+        return (5,)

 gpugemm_no_inplace = GpuGemm(inplace=False)
 gpugemm_inplace = GpuGemm(inplace=True)


-class GpuGer(BlasOp, Ger):
+class GpuGer(BlasOp):
+    __props__ = ('inplace',)
+
+    def __init__(self, inplace=False):
+        """
+        Store the `inplace` flag; when it is set, declare through
+        `destroy_map` that output 0 overwrites input 0.
+        """
+        self.inplace = inplace
+        if inplace:
+            self.destroy_map = {0: [0]}
+
    def make_node(self, A, alpha, x, y):
-        Ger.make_node(self, A, alpha, x, y)
-        A = as_gpuarray_variable(A)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        # The context name is inferred from the three array inputs,
+        # which are then converted with that name. alpha must be 0-d,
+        # A 2-d, x and y 1-d, and the arrays must share one dtype.
+        # The output variable has the same type as A.
+        ctx_name = infer_context_name(A, x, y)
+        A = as_gpuarray_variable(A, ctx_name)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
+        alpha = as_tensor_variable(alpha)
+        assert alpha.ndim == 0
+        assert A.ndim == 2
+        assert x.ndim == 1
+        assert y.ndim == 1
        assert A.dtype == x.dtype == y.dtype
        return Apply(self, [A, alpha, x, y], [A.type()])

    def perform(self, node, inp, out):
        A, alpha, x, y = inp
-        inplace = self.destructive
+        inplace = self.inplace
        if inplace and not A.flags.forc:
            inplace = False
        out[0][0] = blas.ger(alpha, x, y, A,
@@ -198,10 +215,10 @@ class GpuGer(BlasOp, Ger):
    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
                    fail=sub['fail'], name=name)
-        if self.destructive:
+        if self.inplace:
            code = """
                   if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) {
-                     %(out)s = gpublas_try_copy(%(out)s, %(A)s);
+                     %(out)s = theano_try_copy(%(out)s, %(A)s);
                     if (%(out)s == NULL) {
                       %(fail)s
                     }
@@ -213,7 +230,7 @@ class GpuGer(BlasOp, Ger):
                   """ % vars
        else:
            code = """
-                   %(out)s = gpublas_try_copy(%(out)s, %(A)s);
+                   %(out)s = theano_try_copy(%(out)s, %(A)s);
                   if (%(out)s == NULL) {
                       %(fail)s
                   }
@@ -231,18 +248,22 @@ class GpuGer(BlasOp, Ger):
        return code

    def c_code_cache_version(self):
-        return (2,)
+        return (3,)
+

+gpuger_no_inplace = GpuGer(inplace=False)
+gpuger_inplace = GpuGer(inplace=True)

-gpuger_no_inplace = GpuGer(destructive=False)
-gpuger_inplace = GpuGer(destructive=True)

+class GpuDot22(BlasOp):
+    __props__ = ()

-class GpuDot22(BlasOp, Dot22):
    def make_node(self, x, y):
-        Dot22.make_node(self, x, y)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        # The context name is inferred from both inputs, which are
+        # then converted with that name. Both must be 2-d and share
+        # one dtype.
+        # NOTE(review): the output reuses x's type as is; confirm this
+        # is right when x and y differ in their broadcastable pattern.
+        ctx_name = infer_context_name(x, y)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
+        assert x.ndim == 2
+        assert y.ndim == 2
        assert x.dtype == y.dtype
        return Apply(self, [x, y], [x.type()])

@@ -268,7 +289,7 @@ class GpuDot22(BlasOp, Dot22):
        dims[1] = PyGpuArray_DIMS(%(B)s)[1];

        if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
-                              pygpu_default_context())) {
+                               %(A)s->context)) {
            %(fail)s
        }

@@ -287,32 +308,24 @@ class GpuDot22(BlasOp, Dot22):
        return code

    def c_code_cache_version(self):
-        return (3,)
+        return (4,)

 gpu_dot22 = GpuDot22()


-@local_optimizer([gpugemv_no_inplace], inplace=True)
-def local_inplace_gpuagemv(node):
-    if node.op == gpugemv_no_inplace:
-        return [gpugemv_inplace(*node.inputs)]
+@inplace_allocempty(GpuGemv, 0)
+def local_inplace_gpuagemv(node, inputs):
+    # Build the in-place GpuGemv from the inputs handed over by the
+    # inplace_allocempty decorator (presumably the node's inputs with
+    # input 0 made safe to overwrite -- confirm in opt_util).
+    return [gpugemv_inplace(*inputs)]


-@local_optimizer([gpugemm_no_inplace], inplace=True)
-def local_inplace_gpuagemm(node):
-    if node.op == gpugemm_no_inplace:
-        inputs = list(node.inputs)
-        C = inputs[0]
-        if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
-                len(C.clients) > 1):
-            inputs[0] = C.owner.op(*C.owner.inputs)
-        return [gpugemm_inplace(*inputs)]
+@inplace_allocempty(GpuGemm, 0)
+def local_inplace_gpuagemm(node, inputs):
+    # Build the in-place GpuGemm from the inputs handed over by the
+    # inplace_allocempty decorator. The GpuAllocEmpty duplication that
+    # the removed code did by hand is presumably done by the decorator
+    # now -- confirm in opt_util.
+    return [gpugemm_inplace(*inputs)]


-@local_optimizer([gpuger_no_inplace], inplace=True)
-def local_inplace_gpuager(node):
-    if node.op == gpuger_no_inplace:
-        return [gpuger_inplace(*node.inputs)]
+@inplace_allocempty(GpuGer, 0)
+def local_inplace_gpuager(node, inputs):
+    # Build the in-place GpuGer from the inputs handed over by the
+    # inplace_allocempty decorator (presumably the node's inputs with
+    # input 0 made safe to overwrite -- confirm in opt_util).
+    return [gpuger_inplace(*inputs)]

 gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv,
                                            local_inplace_gpuagemm,

--- a/theano/sandbox/gpuarray/conv.cu
+++ b/theano/sandbox/gpuarray/conv.cu
--- a/theano/sandbox/gpuarray/conv.py
+++ b/theano/sandbox/gpuarray/conv.py
 import copy
 import os

-import theano
-from theano import config, gof
+from theano import gof

 try:
    from pygpu import gpuarray
@@ -10,7 +9,8 @@ except ImportError:
    pass

 from .type import GpuArrayType
-from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
+from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
+                        infer_context_name)
 from theano.gof import utils


@@ -58,6 +58,9 @@ class GpuConv(GpuKernelBase, gof.Op):
        them.

    """
+    __props__ = ('border_mode', 'subsample', 'logical_img_hw',
+                 'logical_kern_hw', 'logical_kern_align_top', 'version',
+                 'verbose', 'kshp', 'imshp', 'max_threads_dim0')

    @staticmethod
    def logical_output_shape_2d(imshp, kshp, mode):
@@ -67,20 +70,13 @@ class GpuConv(GpuKernelBase, gof.Op):
            return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
        raise ValueError(mode)

-    def __init__(self, border_mode,
-                 subsample=(1, 1),
-                 logical_img_hw=None,
-                 logical_kern_hw=None,
+    def __init__(self, border_mode, subsample=(1, 1),
+                 logical_img_hw=None, logical_kern_hw=None,
                 logical_kern_align_top=True,
-                 version=-1,
-                 direction_hint=None,
-                 verbose=0,
-                 kshp=None,
-                 imshp=None,
+                 version=-1, direction_hint=None,
+                 verbose=0, kshp=None, imshp=None,
                 max_threads_dim0=None,
-                 nkern=None,
-                 bsize=None,
-                 fft_opt=True):
+                 nkern=None, bsize=None, fft_opt=True):
        self.border_mode = border_mode
        self.subsample = subsample
        if logical_img_hw is not None:
@@ -108,19 +104,6 @@ class GpuConv(GpuKernelBase, gof.Op):
        self.bsize = bsize
        self.fft_opt = fft_opt

-    def __eq__(self, other):
-        return type(self) == type(other) \
-            and self.border_mode == other.border_mode \
-            and self.subsample == other.subsample \
-            and self.logical_img_hw == other.logical_img_hw \
-            and self.logical_kern_hw == other.logical_kern_hw \
-            and self.logical_kern_align_top == other.logical_kern_align_top \
-            and self.version == other.version \
-            and self.verbose == other.verbose \
-            and self.kshp == other.kshp\
-            and self.imshp == other.imshp\
-            and self.max_threads_dim0 == other.max_threads_dim0
-
    def __setstate__(self, d):
        self.__dict__.update(d)
        if not hasattr(self, "imshp"):
@@ -136,32 +119,6 @@ class GpuConv(GpuKernelBase, gof.Op):
        if not hasattr(self, "fft_opt"):
            self.fft_opt = True

-    def __hash__(self):
-        # don't use hash(self.version) as hash(-1)==-2 and
-        # hash(-2)==-2 in python!
-        return hash(type(self)) \
-            ^ hash(self.border_mode) \
-            ^ hash(self.subsample) \
-            ^ hash(self.logical_img_hw) \
-            ^ hash(self.logical_kern_hw) \
-            ^ hash(self.logical_kern_align_top) \
-            ^ self.version \
-            ^ hash(self.verbose) \
-            ^ hash(self.kshp)\
-            ^ hash(self.imshp)\
-            ^ hash(self.max_threads_dim0)
-
-    def __str__(self):
-        return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
-            self.__class__.__name__,
-            self.border_mode,
-            str(self.subsample),
-            str(self.logical_img_hw),
-            str(self.logical_kern_hw),
-            str(self.logical_kern_align_top),
-            str(self.imshp),
-            str(self.kshp))
-
    def make_node(self, img, kern):
        if img.dtype != "float32" or kern.dtype != "float32":
            raise NotImplementedError("GpuConv currently only work"
@@ -170,13 +127,17 @@ class GpuConv(GpuKernelBase, gof.Op):
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
-        img = as_gpuarray_variable(img)
-        kern = as_gpuarray_variable(kern)
+        ctx_name = infer_context_name(img, kern)
+        img = as_gpuarray_variable(img, ctx_name)
+        kern = as_gpuarray_variable(kern, ctx_name)
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
-        out = GpuArrayType(img.dtype, broadcastable)()
+        out = GpuArrayType(img.dtype, broadcastable, context_name=ctx_name)()
        return gof.Apply(self, [img, kern], [out])

+    def get_context(self, node):
+        """Return the context of the type of the node's first input."""
+        first_input = node.inputs[0]
+        return first_input.type.context
+
    def flops(self, inputs, outputs):
        """
        Useful with the hack in profilemode to print the MFlops.
@@ -202,22 +163,8 @@ class GpuConv(GpuKernelBase, gof.Op):
    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        node_ = copy.copy(node)
        assert node.op is node_.op
-        if config.gpuarray.sync:
-            raise NotImplementedError("GpuConv do not implement gpuarray.sync Theano flag")
        if node_.op.max_threads_dim0 is None:
-            cuda = theano.sandbox.cuda
-            device_id = cuda.use.device_number
-            if device_id is None:
-                cuda.use("gpu",
-                         force=False,
-                         default_to_move_computation_to_gpu=False,
-                         move_shared_float32_to_gpu=False,
-                         enable_cuda=False,
-                         test_driver=True)
-                device_id = cuda.use.device_number
-            cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
-            prop = cuda_ndarray.device_properties(device_id)
-            node_.op.max_threads_dim0 = prop['maxThreadsDim0']
+            # Fill in the missing limit from the context of the first
+            # input's type. The copied node shares its op with `node`
+            # (see the assert above), so this sets the attribute on
+            # that shared op.
+            node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
        return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                                   compute_map, no_recycling)

@@ -232,9 +179,11 @@ class GpuConv(GpuKernelBase, gof.Op):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 22)
+        return (0, 23)

    def c_code(self, node, nodename, inp, out_, sub):
+        if node.inputs[0].type.context.kind != "cuda":
+            raise NotImplementedError("GpuConv only works for cuda devices")
        img, kern = inp
        out, = out_
        dx = self.subsample[0]
@@ -302,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
        """ % locals()
        code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
                           for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
-        kname = "conv_full_load_everything"
        gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
        bin = gk._binary
        bcode = ','.join(hex(ord(c)) for c in bin)
@@ -313,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
        static const char conv_bcode[] = {%(bcode)s};
        static const char *conv_code = "%(code)s";
        """ % locals()
-        for k in kernels:
-            mod += "static GpuKernel " + k.name + '_' + name + ";\n"
-        mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
+        return mod
+
+    def c_support_code_struct(self, node, name):
+        # Append the contents of conv.cu (located next to this module)
+        # to the struct support code produced by GpuKernelBase.
+        mod = GpuKernelBase.c_support_code_struct(self, node, name)
+        with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
+            mod += f.read()
        return mod

    @utils.memoize

--- a/theano/sandbox/gpuarray/conv_kernel.cu
+++ b/theano/sandbox/gpuarray/conv_kernel.cu
@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {

 //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
 //bit, it is 32 bit.
-const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
+const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers

 __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
  if (nb_thread < 64)
@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
      if (thread_id < nb_thread)
        {
          const float * my_src_ptr = (const float *)(
-                  ((uintptr_t)src) & COALESCED_ALIGN);
+                  ((size_t)src) & COALESCED_ALIGN);
          my_src_ptr += thread_id;
          while (my_src_ptr < src + N)
          {

--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
--- a/theano/sandbox/gpuarray/dnn_base.c
+++ b/theano/sandbox/gpuarray/dnn_base.c
@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle);
 #section init_code_struct

 {
-  cuda_enter(pygpu_default_context()->ctx);
+  cuda_enter(CONTEXT->ctx);
  cudnnStatus_t err;
  APPLY_SPECIFIC(_handle) = NULL;
  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                 cudnnGetErrorString(err));
-    cuda_exit(pygpu_default_context()->ctx);
+    cuda_exit(CONTEXT->ctx);
    FAIL;
  }
-  cuda_exit(pygpu_default_context()->ctx);
+  cuda_exit(CONTEXT->ctx);
 }
--- a/theano/sandbox/gpuarray/dnn_fwd.c
+++ b/theano/sandbox/gpuarray/dnn_fwd.c
@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                         PyGpuArrayObject *om,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta,
-                         PyGpuArrayObject **output) {
+                         PyGpuArrayObject **output,
+                         PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,

--- a/theano/sandbox/gpuarray/dnn_gi.c
+++ b/theano/sandbox/gpuarray/dnn_gi.c
@@ -4,12 +4,12 @@ int
 APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                        PyGpuArrayObject *im,
                        cudnnConvolutionDescriptor_t desc,
-                        double alpha, double beta, PyGpuArrayObject **input) {
+                        double alpha, double beta, PyGpuArrayObject **input,
+                        PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "

--- a/theano/sandbox/gpuarray/dnn_gw.c
+++ b/theano/sandbox/gpuarray/dnn_gw.c
@@ -4,12 +4,12 @@ int
 APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
-                        double alpha, double beta, PyGpuArrayObject **kerns) {
+                        double alpha, double beta, PyGpuArrayObject **kerns,
+                        PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI

 int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             cudnnPoolingDescriptor_t desc,
-                             PyGpuArrayObject **out) {
+                             PyGpuArrayObject **out,
+                             PyGpuContextObject *c) {
  cudnnStatus_t err;
  size_t dims[5];
-  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");

--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
                                  cudnnPoolingDescriptor_t desc,
-                                  PyGpuArrayObject **inp_grad) {
+                                  PyGpuArrayObject **inp_grad,
+                                  PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,

  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
                         PyGpuArray_DIMS(inp), inp->ga.typecode,
-                         GA_C_ORDER, pygpu_default_context()) != 0) {
+                         GA_C_ORDER, c) != 0) {
    return 1;
  }


--- a/theano/sandbox/gpuarray/dnn_softmax.c
+++ b/theano/sandbox/gpuarray/dnn_softmax.c
@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL)
 #section support_code_struct

 int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
-                            PyGpuArrayObject **out) {
+                            PyGpuArrayObject **out,
+                            PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
    return 1;

--- a/theano/sandbox/gpuarray/dnn_softmax_grad.c
+++ b/theano/sandbox/gpuarray/dnn_softmax_grad.c
@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL)

 int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                 PyGpuArrayObject *sm,
-                                 PyGpuArrayObject **dx) {
+                                 PyGpuArrayObject **dx,
+                                 PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
    return 1;

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
--- a/theano/sandbox/gpuarray/gemm16.c
+++ b/theano/sandbox/gpuarray/gemm16.c
@@ -2,7 +2,7 @@

 /* Why do we need this? */
 size_t dim = 2048 * 32;
-rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(),
+rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, CONTEXT,
                       Py_None);
 if (rand_buf == NULL) {
  FAIL;
@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf;

 int gemm16(PyGpuArrayObject *C, float alpha,
           PyGpuArrayObject *A, PyGpuArrayObject *B,
-           float beta, PyGpuArrayObject **out) {
+           float beta, PyGpuArrayObject **out,
+           PyGpuContextObject *c) {
  PyGpuArrayObject *_A = NULL;
  PyGpuArrayObject *_B = NULL;
  GpuKernel *gk;

--- a/theano/sandbox/gpuarray/neighbours.py
+++ b/theano/sandbox/gpuarray/neighbours.py
@@ -10,7 +10,8 @@ try:
 except ImportError:
    pass

-from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
+from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
+                        infer_context_name)
 from .opt import register_opt as register_gpu_opt, op_lifter
 from .type import GpuArrayType

@@ -25,7 +26,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        self.mode = mode

    def make_node(self, ten4, neib_shape, neib_step):
-        ten4 = as_gpuarray_variable(ten4)
+        ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
        neib_shape = T.as_tensor_variable(neib_shape)
        neib_step = T.as_tensor_variable(neib_step)

@@ -37,7 +38,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):

        return Apply(self, [ten4, neib_shape, neib_step],
                     [GpuArrayType(broadcastable=(False, False),
-                                   dtype=ten4.type.dtype)()])
+                                   dtype=ten4.type.dtype,
+                                   context_name=ten4.type.context_name)()])
+
+    def get_context(self, node):
+        return node.inputs[0].type.context

    def c_code_cache_version(self):
        return (11,)
@@ -56,7 +61,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """
-//a version that use less register but don't work in all case.
+// a version that uses less registers but doesn't work in all cases.
        KERNEL void %(kname)s(
            const int nb_batch,
            const int nb_stack,
@@ -233,6 +238,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        return kernels

    def c_code(self, node, name, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError("cuda only")
        dtype_ten4 = node.inputs[0].dtype
        dtype_neib_shape = node.inputs[1].dtype
        dtype_neib_step = node.inputs[2].dtype
@@ -243,6 +250,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        ten4, neib_shape, neib_step = inp
        z, = out
        fail = sub['fail']
+        ctx = sub['context']
        mode = self.mode
        err_check = """
            if (err != GA_NO_ERROR) {
@@ -369,8 +377,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                dims[0] = z_dim0;
                dims[1] = z_dim1;
                %(z)s = pygpu_empty(2, dims, %(typecode_z)s,
-                                    GA_C_ORDER, pygpu_default_context(),
-                                    Py_None);
+                                    GA_C_ORDER, %(ctx)s, Py_None);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
@@ -453,7 +460,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):


 @op_lifter([Images2Neibs])
-def use_gpu_images2neibs(node):
+def use_gpu_images2neibs(node, context_name):
    if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
        return GpuImages2Neibs(node.op.mode)


--- a/theano/sandbox/gpuarray/nerv.py
+++ b/theano/sandbox/gpuarray/nerv.py
@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
 from theano.scalar import as_scalar, constant

 from . import opt
-from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty)
+from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
+                        infer_context_name)
+from .type import gpu_context_type
 from .opt_util import alpha_merge, output_merge
-from .pycuda_helper import ensure_pycuda_context
-

 try:
    from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
@@ -43,6 +43,7 @@ def ensure_float(val, name):
 class Gemm16(COp):
    __props__ = ('relu', 'inplace')
    _f16_ok = True
+    context_type = gpu_context_type
    KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
                  'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
                  'tn_128x128', 'tn_128x64', 'tn_128x32',
@@ -61,10 +62,11 @@ class Gemm16(COp):
    def make_node(self, C, alpha, A, B, beta):
        if GPUTensor is None:
            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
+        ctx_name = infer_context_name(C, A, B)

-        A = as_gpuarray_variable(A)
-        B = as_gpuarray_variable(B)
-        C = as_gpuarray_variable(C)
+        A = as_gpuarray_variable(A, ctx_name)
+        B = as_gpuarray_variable(B, ctx_name)
+        C = as_gpuarray_variable(C, ctx_name)

        alpha = ensure_float(alpha, 'alpha')
        beta = ensure_float(beta, 'beta')
@@ -73,27 +75,8 @@ class Gemm16(COp):

        return Apply(self, [C, alpha, A, B, beta], [C.type()])

-    def perform(self, node, inputs, outputs):
-        ensure_pycuda_context()
-        C, alpha, A, B, beta = inputs
-        # The nervana code does not support the case where both inputs
-        # are trans, so we need to copy one if them if that is the
-        # case. We copy the smaller one.
-        if A.flags.f_contiguous and B.flags.f_contiguous:
-            if A.size < B.size:
-                A = A.copy()
-            else:
-                B = B.copy()
-        inplace = self.inplace
-        if inplace and not C.flags.c_contiguous:
-            inplace = False
-        if not inplace:
-            C = C.copy()
-        At = to_gputensor(A)
-        Bt = to_gputensor(B)
-        Ct = to_gputensor(C)
-        nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
-        outputs[0][0] = C
+    def get_context(self, node):
+        return node.inputs[0].type.context

    def c_headers(self):
        return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
            codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
        codel.append("const char *bcode;")
        codel.append("size_t sz;")
-        codel.append("PyGpuContextObject *c = pygpu_default_context();")
+        codel.append("PyGpuContextObject *c = %s;" % (sub['context'],))
        codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
                     "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
                     "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
@@ -162,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,

 @opt.register_opt()
 @opt.op_lifter([tensor.Dot])
-def local_dot_to_gemm16(node):
+def local_dot_to_gemm16(node, ctx_name):
    if nerv is None:
        return
    A = node.inputs[0]
@@ -170,7 +153,7 @@ def local_dot_to_gemm16(node):
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
-        C = GpuAllocEmpty(dtype='float16')(
+        C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
            shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
        return Gemm16()(C, 1.0, A, B, 0.0)


--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
@@ -10,7 +10,8 @@ try:
 except ImportError:
    pass

-from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
+from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
+                        infer_context_name)
 from .type import GpuArrayType
 from .kernel_codegen import (nvcc_kernel,
                             inline_softmax,
@@ -23,23 +24,26 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.

    """
-
    nin = 3
    nout = 3
    __props__ = ()
    _f16_ok = True

    def make_node(self, x, b, y_idx):
-        # N.B. won't work when we don't cast y_idx to float anymore
-        x = as_gpuarray_variable(x)
-        b = as_gpuarray_variable(b)
-        y_idx = as_gpuarray_variable(y_idx)
+        ctx_name = infer_context_name(x, b, y_idx)
+        x = as_gpuarray_variable(x, ctx_name)
+        b = as_gpuarray_variable(b, ctx_name)
+        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        nll = GpuArrayType(x.type.dtype,
-                           y_idx.type.broadcastable)()
+                           y_idx.type.broadcastable,
+                           context_name=ctx_name)()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

@@ -144,6 +148,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
                       flags=flags, objvar=k_var)]

    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError('cuda only')
        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
        typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
@@ -163,6 +169,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        dtype_am = node.outputs[2].dtype
        classname = self.__class__.__name__
        fail = sub['fail']
+        ctx = sub['context']
        k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
        err_check = """
            if (err != GA_NO_ERROR) {
@@ -214,9 +221,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        {
            Py_XDECREF(%(nll)s);
            %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                %(typecode_x)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode_x)s, GA_C_ORDER, %(ctx)s,
+                                Py_None);
            if (!%(nll)s) {
                %(fail)s
            }
@@ -229,9 +235,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                %(typecode_b)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode_b)s, GA_C_ORDER,
+                                %(ctx)s, Py_None);
            if(!%(sm)s)
            {
                PyErr_SetString(PyExc_MemoryError,
@@ -246,9 +251,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        {
            Py_XDECREF(%(am)s);
            %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                %(typecode_y_idx)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode_y_idx)s, GA_C_ORDER,
+                                %(ctx)s, Py_None);
            if(!%(am)s)
            {
                PyErr_SetString(PyExc_MemoryError,
@@ -306,18 +310,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
    Gradient wrt x of the CrossentropySoftmax1Hot Op.

    """
-
    nin = 3
    nout = 1
    __props__ = ()
    _f16_ok = True

    def make_node(self, dnll, sm, y_idx):
-        dnll = as_gpuarray_variable(dnll)
-        sm = as_gpuarray_variable(sm)
-        y_idx = as_gpuarray_variable(y_idx)
+        ctx_name = infer_context_name(dnll, sm, y_idx)
+        dnll = as_gpuarray_variable(dnll, ctx_name)
+        sm = as_gpuarray_variable(sm, ctx_name)
+        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        return Apply(self, [dnll, sm, y_idx], [sm.type()])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def c_code_cache_version(self):
        return (11,)

@@ -325,6 +332,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError("cuda only")
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
@@ -338,6 +347,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
+        ctx = sub['context']
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        err_check = """
            if (err != GA_NO_ERROR) {
@@ -403,9 +413,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        {
            Py_XDECREF(%(dx)s);
            %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
-                                 %(typecode_dx)s,
-                                 GA_C_ORDER,
-                                 pygpu_default_context(), Py_None);
+                                 %(typecode_dx)s, GA_C_ORDER,
+                                 %(ctx)s, Py_None);
            if (!%(dx)s) {
                %(fail)s
            }
@@ -512,14 +521,16 @@ class GpuSoftmax(GpuKernelBase, Op):
    Implement Softmax on the gpu.

    """
-
    __props__ = ()
    _f16_ok = True

    def make_node(self, x):
-        x = as_gpuarray_variable(x)
+        x = as_gpuarray_variable(x, infer_context_name(x))
        return Apply(self, [x], [x.type()])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def infer_shape(self, node, shape):
        return shape

@@ -530,6 +541,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError("cuda only")
        dtype_x = node.inputs[0].dtype
        work_x = work_dtype(dtype_x)
        dtype_z = node.outputs[0].dtype
@@ -539,6 +552,7 @@ class GpuSoftmax(GpuKernelBase, Op):
        x, = inp
        z, = out
        fail = sub['fail']
+        ctx = sub['context']
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
@@ -568,9 +582,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                %(typecode)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode)s, GA_C_ORDER,
+                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
@@ -698,22 +711,25 @@ class GpuSoftmax(GpuKernelBase, Op):
 gpu_softmax = GpuSoftmax()


-class GpuSoftmaxWithBias (GpuKernelBase, Op):
+class GpuSoftmaxWithBias(GpuKernelBase, Op):
    """
    Implement SoftmaxWithBias on the gpu.

    """
-
    nin = 2
    nout = 1
    __props__ = ()
    _f16_ok = True

    def make_node(self, x, b):
-        x = as_gpuarray_variable(x)
-        b = as_gpuarray_variable(b)
+        ctx_name = infer_context_name(x, b)
+        x = as_gpuarray_variable(x, ctx_name)
+        b = as_gpuarray_variable(b, ctx_name)
        return Apply(self, [x, b], [x.type()])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def infer_shape(self, node, shape):
        return [shape[0]]

@@ -724,6 +740,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError('cuda only')
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
@@ -735,6 +753,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        x, b = inp
        z, = out
        fail = sub['fail']
+        ctx = sub['context']
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
@@ -777,9 +796,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                %(typecode)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode)s, GA_C_ORDER,
+                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
--- a/theano/sandbox/gpuarray/opt_util.py
+++ b/theano/sandbox/gpuarray/opt_util.py
@@ -294,7 +294,7 @@ def inplace_allocempty(op, idx):
    function can be as simple as:

        def maker(node, inputs):
-            return node.op.__class__(inplace=True)(*inputs)
+            return [node.op.__class__(inplace=True)(*inputs)]

    Parameters
    ----------
@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx):
            if (alloc.owner and
                    isinstance(alloc.owner.op, GpuAllocEmpty) and
                    len(alloc.clients) > 1):
-                alloc_op = GpuAllocEmpty(alloc.owner.op.dtype)
+                alloc_op = GpuAllocEmpty(alloc.owner.op.dtype,
+                                         alloc.owner.op.context_name)
                inputs[idx] = alloc_op(*alloc.owner.inputs)
            return maker(node, inputs)
        return opt

--- a/theano/sandbox/gpuarray/pycuda_helper.py
+++ b/theano/sandbox/gpuarray/pycuda_helper.py
-try:
-    from pycuda.driver import Context
-    if not hasattr(Context, 'attach'):
-        raise ImportError('too old')
-except ImportError:
-    Context = None
-
-pycuda_initialized = False
-pycuda_context = None
-
-
-def ensure_pycuda_context():
-    global pycuda_context, pycuda_initialized
-    if not pycuda_initialized:
-        if Context is None:
-            raise RuntimeError("PyCUDA not found or too old.")
-        else:
-            pycuda_context = Context.attach()
-            import atexit
-            atexit.register(pycuda_context.detach)
-            pycuda_initialized = True
-    return pycuda_context
--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
 from __future__ import print_function

-import copy
 import os
+import copy

 import numpy

 import theano
-from theano import tensor, gof, config
-from theano.gof.utils import MethodNotDefined
+from theano import tensor, gof
 from six.moves import StringIO
 from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
 import theano.tensor.inplace
@@ -19,7 +18,8 @@ except ImportError:
    pass

 from .type import GpuArrayType
-from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
+from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
+                        infer_context_name)
 from .elemwise import GpuElemwise


@@ -27,10 +27,12 @@ class GpuSubtensor(HideC, Subtensor):
    _f16_ok = True

    def make_node(self, x, *inputs):
+        ctx_name = infer_context_name(x)
        rval = tensor.Subtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
-                             broadcastable=rval.outputs[0].type.broadcastable)
-        x = as_gpuarray_variable(x)
+                             broadcastable=rval.outputs[0].type.broadcastable,
+                             context_name=ctx_name)
+        x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
@@ -191,14 +193,18 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)

    def make_node(self, x, y, *inputs):
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(x, y)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        op = copy.copy(self)
        ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
        op.create_iadd_node(ret)
        return ret

+    def get_context(self, node):
+        return node.outputs[0].type.context
+
    def create_iadd_node(self, node):
        # We store a iadd_node in the op that contain the info needed
        # for the inplace add.
@@ -210,7 +216,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        iadd_node = gop(xview, y).owner
        self.iadd_node = iadd_node

-    def perform(self, node, inputs, out_):
+    def perform(self, node, inputs, out_, ctx):
        out, = out_
        x, y = inputs[:2]
        indices = list(reversed(inputs[2:]))
@@ -321,7 +327,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
                                  %(view_ndim)s,
                                  dims,
                                  xview_strides,
-                                  pygpu_default_context(),
+                                  %(x)s->context,
                                  1,
                                  (PyObject *)%(x)s,
                                  (PyObject *)&PyGpuArrayType);
@@ -355,10 +361,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """
        return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()

-    def c_support_code_apply(self, node, nodename):
+    def c_support_code_struct(self, node, nodename):
        gop = self.iadd_node.op
        sub_name = nodename + "_add_to_zview"
-        ret = gop.c_support_code_apply(self.iadd_node, sub_name)
+        ret = gop.c_support_code_struct(self.iadd_node, sub_name)
        ret += """
        PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
                                                    PyGpuArrayObject* src){
@@ -366,10 +372,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """ % locals()
        inputs = ["dst", "src"]
        outputs = ["ret"]
-        sub = {"fail": "return NULL;"}
+        sub = {"fail": "return NULL;", "context": "dst->context"}
        ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
        ret += """
-            return dst;
+            return ret;
+
        }
        """
        return ret
@@ -399,7 +406,8 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):

 class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
    def make_node(self, x, ilist):
-        x_ = as_gpuarray_variable(x)
+        ctx_name = infer_context_name(x, ilist)
+        x_ = as_gpuarray_variable(x, ctx_name)

        ilist__ = tensor.as_tensor_variable(ilist)
        if ilist__.type.dtype[:3] not in ('int', 'uin'):
@@ -407,7 +415,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
        if ilist__.type.dtype != 'int64':
            ilist__ = tensor.cast(ilist__, 'int64')

-        ilist_ = as_gpuarray_variable(ilist__)
+        ilist_ = as_gpuarray_variable(ilist__, ctx_name)

        if ilist_.type.dtype != 'int64':
            raise TypeError('index must be int64')
@@ -419,6 +427,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
        bcast = ilist_.broadcastable + x_.broadcastable[1:]
        return gof.Apply(self, [x_, ilist_],
                         [GpuArrayType(dtype=x.dtype,
+                                       context_name=ctx_name,
                                       broadcastable=bcast)()])

    def perform(self, node, inp, out_):
@@ -475,8 +484,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
    """

    def make_node(self, x, y, ilist):
-        x_ = as_gpuarray_variable(x)
-        y_ = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(x, y)
+        x_ = as_gpuarray_variable(x, ctx_name)
+        y_ = as_gpuarray_variable(y, ctx_name)
        ilist_ = tensor.as_tensor_variable(ilist)

        assert x_.type.dtype == y_.type.dtype
@@ -567,16 +577,16 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
    only avail on compute capability 2.0 and more recent.

    """
-
    _f16_ok = True

    def make_node(self, x, y, ilist):
        """It defer from GpuAdvancedIncSubtensor1 in that it make sure
        the index are of type long.
        """
-        x_ = as_gpuarray_variable(x)
-        y_ = as_gpuarray_variable(y)
-        ilist_ = as_gpuarray_variable(ilist)
+        ctx_name = infer_context_name(x, y, ilist)
+        x_ = as_gpuarray_variable(x, ctx_name)
+        y_ = as_gpuarray_variable(y, ctx_name)
+        ilist_ = as_gpuarray_variable(ilist, ctx_name)

        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim
@@ -599,32 +609,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):

        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

+    def get_context(self, node):
+        return node.outputs[0].type.context
+
+    def perform(self, node, inp, out, ctx):
+        return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
+
    def c_code_cache_version(self):
        return (6,)

    def c_headers(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['cuda.h', '<numpy_compat.h>', '<gpuarray_helper.h>',
+        return ['<numpy_compat.h>', '<gpuarray_helper.h>',
                '<gpuarray/types.h>']

    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        res = [os.path.dirname(__file__)]
-        if cuda_root:
-            res.append(os.path.join(cuda_root, 'include'))
-        return res
+        return [os.path.dirname(__file__)]

    def c_code(self, node, name, inputs, outputs, sub):
-        active_device_no = theano.sandbox.cuda.active_device_number()
-        device_properties = theano.sandbox.cuda.device_properties
-        compute_capability = device_properties(active_device_no)['major']
-        if ((self.set_instead_of_inc) or
-                (node.inputs[0].ndim != node.inputs[1].ndim) or
-                (node.inputs[0].ndim != 2) or
-                (compute_capability < 2)):
+        ctx = self.get_context(node)
+        if ctx.kind != 'cuda':
+            raise NotImplementedError("cuda only")
+        if (self.set_instead_of_inc or
+                node.inputs[0].ndim != node.inputs[1].ndim or
+                node.inputs[0].ndim != 2 or
+                ctx.bin_id[-2] < '2'):
            raise NotImplementedError("This case does not have C code yet.")

        x = inputs[0]
@@ -754,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]

-    def c_support_code_apply(self, node, nodename):
+    def c_support_code_struct(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
@@ -765,7 +773,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
        itemsize_out = numpy.dtype(dtype_out).itemsize
        k_var = "k_vector_add_fast_" + nodename

-        return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
+        return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
        int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                     PyGpuArrayObject* py_other,
                                     PyGpuArrayObject *indices_arr)

--- a/theano/sandbox/gpuarray/tests/config.py
+++ b/theano/sandbox/gpuarray/tests/config.py
+# Shared setup for the gpuarray test modules: makes sure a gpuarray device
+# is active (or skips) and defines the compilation modes the tests use.
+from nose.plugins.skip import SkipTest
+
+import theano.sandbox.gpuarray
+
+# Raised at import time, so a module importing this file does not get past
+# its import when pygpu is not installed.
+if theano.sandbox.gpuarray.pygpu is None:
+    raise SkipTest("pygpu not installed")
+
+# No gpuarray device is active yet: if the old CUDA backend is available,
+# call its `use` with computation moving, float32 shared-variable moving and
+# enable_cuda all switched off, then initialize the gpuarray backend on 'cuda'.
+if not theano.sandbox.gpuarray.pygpu_activated:
+    import theano.sandbox.cuda as cuda_ndarray
+    if cuda_ndarray.cuda_available:
+        cuda_ndarray.use('gpu', default_to_move_computation_to_gpu=False,
+                         move_shared_float32_to_gpu=False,
+                         enable_cuda=False)
+        theano.sandbox.gpuarray.init_dev('cuda')
+
+# Re-check after the initialization attempt above (which is only made when
+# CUDA is available); skip if there is still no active device.
+if not theano.sandbox.gpuarray.pygpu_activated:
+    raise SkipTest("pygpu disabled")
+
+# Context name exported to the test modules (test_basic_ops imports it).
+# NOTE(review): None presumably selects the default context -- confirm
+# against the context registry in type.py.
+test_ctx_name = None
+
+# Under FAST_COMPILE the modes are derived from FAST_RUN instead of the
+# default mode. The GPU mode includes the 'gpuarray' optimizations and
+# excludes 'gpu'; the reference mode excludes 'gpuarray'.
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -13,53 +13,22 @@ from theano.tensor.basic import alloc
 from theano.tensor.tests import test_basic
 from theano.tensor.tests.test_basic import rand, safe_make_node
 from theano.tests import unittest_tools as utt
-from theano.tests.unittest_tools import SkipTest

-import theano.sandbox.gpuarray
-
-from ..type import (GpuArrayType,
+from ..type import (GpuArrayType, get_context,
                    gpuarray_shared_constructor)
 from ..basic_ops import (
-    host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
-    gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
+    host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape,
+    GpuAlloc, GpuAllocEmpty, GpuContiguous,
    gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
 from ..subtensor import GpuSubtensor

-import theano.sandbox.cuda as cuda_ndarray
-
-try:
-    from pygpu import gpuarray
-except:
-    pass
-
-if theano.sandbox.gpuarray.pygpu is None:
-    raise SkipTest("pygpu not installed")
+from .config import mode_with_gpu, mode_without_gpu, test_ctx_name

-# If you are writing a new test file, don't copy this code, but rather
-# import stuff from this file (like mode_with_gpu) to reuse it.
-if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
-    if not cuda_ndarray.use.device_number:
-        # We should not enable all the use like the flag device=gpu,
-        # as many tests don't work in that setup.
-        cuda_ndarray.use('gpu',
-                         default_to_move_computation_to_gpu=False,
-                         move_shared_float32_to_gpu=False,
-                         enable_cuda=False)
-    theano.sandbox.gpuarray.init_dev('cuda')
-
-if not theano.sandbox.gpuarray.pygpu_activated:
-    raise SkipTest("pygpu disabled")
+from pygpu import gpuarray

 utt.seed_rng()
 rng = numpy.random.RandomState(seed=utt.fetch_seed())

-if theano.config.mode == 'FAST_COMPILE':
-    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
-    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
-else:
-    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
-    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
-

 def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
                 on_unused_input='raise', name=None):
@@ -88,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs):
    cls = kwargs.pop('cls', None)
    if len(kwargs) != 0:
        raise TypeError('Unexpected argument %s', list(kwargs.keys())[0])
-    return gpuarray.array(r, dtype=dtype, cls=cls)
+    return gpuarray.array(r, dtype=dtype, cls=cls,
+                          context=get_context(test_ctx_name))


 def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
@@ -114,6 +84,7 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,

        def test_all(self):
            if skip:
+                from nose.plugins.skip import SkipTest
                raise SkipTest(skip)

            for testname, inputs in iteritems(cases):
@@ -199,9 +170,9 @@ def test_transfer_cpu_gpu():
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    av = numpy.asarray(rng.rand(5, 4), dtype='float32')
-    gv = gpuarray.array(av)
+    gv = gpuarray.array(av, context=get_context(test_ctx_name))

-    f = theano.function([a], gpu_from_host(a))
+    f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

@@ -218,12 +189,12 @@ def test_transfer_strided():
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    av = numpy.asarray(rng.rand(5, 8), dtype='float32')
-    gv = gpuarray.array(av)
+    gv = gpuarray.array(av, context=get_context(test_ctx_name))

    av = av[:, ::2]
    gv = gv[:, ::2]

-    f = theano.function([a], gpu_from_host(a))
+    f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

@@ -233,14 +204,14 @@ def test_transfer_strided():


 def gpu_alloc_expected(x, *shp):
-    g = gpuarray.empty(shp, dtype=x.dtype)
+    g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    g[:] = x
    return g

 GpuAllocTester = makeTester(
    name="GpuAllocTester",
    op=alloc,
-    gpu_op=gpu_alloc,
+    gpu_op=GpuAlloc(test_ctx_name),
    cases=dict(
        correct01=(rand(), numpy.int32(7)),
        # just gives a DeepCopyOp with possibly wrong results on the CPU
@@ -260,19 +231,19 @@ class TestAlloc(test_basic.TestAlloc):
    dtype = "float32"
    mode = mode_with_gpu
    shared = staticmethod(gpuarray_shared_constructor)
-    allocs = [GpuAlloc(), GpuAlloc(), T.Alloc()]
+    allocs = [GpuAlloc(test_ctx_name), GpuAlloc(test_ctx_name), T.Alloc()]


 def test_alloc_empty():
    for dt in ['float32', 'int8']:
-        f = theano.function([], GpuAllocEmpty(dt)(2, 3))
+        f = theano.function([], GpuAllocEmpty(dt, context_name=test_ctx_name)(2, 3))
        assert len(f.maker.fgraph.apply_nodes) == 1
        out = f()
        assert out.shape == (2, 3)
        assert out.dtype == dt

-    f = theano.function([], [GpuAllocEmpty('uint64')(3, 2),
-                             GpuAllocEmpty('uint64')(3, 2)])
+    f = theano.function([], [GpuAllocEmpty('uint64', test_ctx_name)(3, 2),
+                             GpuAllocEmpty('uint64', test_ctx_name)(3, 2)])
    out = f()
    assert out[0].shape == (3, 2)
    assert out[0].dtype == 'uint64'
@@ -284,7 +255,7 @@ def test_alloc_empty():

 def test_shape():
    x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
-    v = gpuarray.zeros((3, 4, 5), dtype='float32')
+    v = gpuarray.zeros((3, 4, 5), dtype='float32', context=get_context(test_ctx_name))
    f = theano.function([x], x.shape)
    topo = f.maker.fgraph.toposort()
    assert numpy.all(f(v) == (3, 4, 5))
@@ -436,12 +407,13 @@ def test_hostfromgpu_shape_i():
    ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    cv = gpuarray.asarray(numpy.random.rand(5, 4),
-                          dtype='float32')
+                          dtype='float32',
+                          context=get_context(test_ctx_name))

-    f = theano.function([a], gpu_from_host(a), mode=m)
-    assert gpu_from_host in [x.op
-                             for x in f.maker.fgraph.toposort()]
-    f = theano.function([a], gpu_from_host(a).shape, mode=m)
+    f = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
+    assert any(isinstance(x.op, GpuFromHost)
+               for x in f.maker.fgraph.toposort())
+    f = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)

--- a/theano/sandbox/gpuarray/tests/test_blas.py
+++ b/theano/sandbox/gpuarray/tests/test_blas.py
@@ -10,8 +10,8 @@ from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
 from theano.tensor.tests.test_blas import TestGer, BaseGemv

 from .. import gpuarray_shared_constructor
-from .test_basic_ops import (makeTester, rand,
-                             mode_with_gpu)
+from .config import mode_with_gpu
+from .test_basic_ops import makeTester, rand

 from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
                    gpugemm_inplace,
@@ -100,7 +100,7 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
        self.ops = [gpuger_no_inplace, gpuger_inplace]

    def clone(self, op):
-        return GpuGer(destructive=op.destructive)
+        return GpuGer(inplace=op.inplace)


 GpuDot22Tester = makeTester(

--- a/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
@@ -14,8 +14,8 @@ from theano import tensor
 from theano.tests.unittest_tools import seed_rng

 # We let that import do the init of the back-end if needed.
-from .test_basic_ops import mode_with_gpu
-from ..type import GpuArrayType
+from .config import mode_with_gpu, test_ctx_name
+from ..type import GpuArrayType, get_context
 from ..conv import GpuConv
 from theano.sandbox.gpuarray import dnn

@@ -28,7 +28,7 @@ try:
 except ImportError:
    pass

-gftensor4 = GpuArrayType('float32', [False] * 4)
+gftensor4 = GpuArrayType('float32', [False] * 4, context_name=test_ctx_name)


 def py_conv_valid_numpy(img, kern):
@@ -135,8 +135,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
-    img = pygpu.array(npy_img)
-    kern = pygpu.array(npy_kern)
+    img = pygpu.array(npy_img, context=get_context(test_ctx_name))
+    kern = pygpu.array(npy_kern, context=get_context(test_ctx_name))

    # we take the stride after the transfert as we make c_contiguous
    # data on the GPU.

--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
@@ -15,12 +15,12 @@ from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad
 from .. import dnn
 from ..basic_ops import GpuAllocEmpty

-from .test_basic_ops import mode_with_gpu, mode_without_gpu
+from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
 from . import test_nnet


 def test_dnn_conv_desc_merge():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    kern_shp = T.as_tensor_variable(
        numpy.asarray([3, 1, 2, 2]).astype('int64'))
@@ -41,7 +41,7 @@ def test_dnn_conv_desc_merge():

 def test_dnn_conv_merge():
    # This test that we merge correctly multiple dnn_conv.
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
@@ -80,7 +80,7 @@ def test_dnn_conv_inplace():
    GpuAllocEmpty get merged together.

    """
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
@@ -105,7 +105,7 @@ def test_dnn_conv_inplace():
    assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2

    # Test grad w op
-    out = GpuAllocEmpty(kern.dtype)(*kern.shape)
+    out = GpuAllocEmpty(kern.dtype, test_ctx_name)(*kern.shape)
    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
@@ -116,7 +116,7 @@ def test_dnn_conv_inplace():
    assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2

    # Test grad i op
-    out = GpuAllocEmpty(img.dtype)(*img.shape)
+    out = GpuAllocEmpty(img.dtype, test_ctx_name)(*img.shape)
    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
@@ -163,7 +163,7 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,


 def test_pooling():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    x = T.ftensor4()
@@ -269,7 +269,7 @@ def test_pooling():


 def test_pooling_opt():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    x = T.fmatrix()
@@ -318,7 +318,7 @@ def test_dnn_tag():
            max_pool_2d(x, ds=(2, 2), ignore_border=True),
            mode=mode_with_gpu.including("cudnn"))
    except (AssertionError, RuntimeError):
-        assert not dnn.dnn_available()
+        assert not dnn.dnn_available(test_ctx_name)
        raised = True
    finally:
        theano.config.on_opt_error = old
@@ -327,7 +327,7 @@ def test_dnn_tag():
        logging.getLogger('theano').addHandler(theano.logging_default_handler)

    if not raised:
-        assert dnn.dnn_available()
+        assert dnn.dnn_available(test_ctx_name)
        assert any([isinstance(n.op, dnn.GpuDnnPool)
                    for n in f.maker.fgraph.toposort()])

@@ -338,7 +338,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
        self.mode = mode_with_gpu

    def test_softmax(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        t = T.ftensor4('t')
        rand_tensor = numpy.asarray(
@@ -368,7 +368,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
        )

    def test_conv(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        kerns = T.ftensor4('kerns')
@@ -406,7 +406,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
            )

    def test_conv_gradw(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        kerns = T.ftensor4('kerns')
@@ -455,7 +455,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
            )

    def test_conv_gradi(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        kerns = T.ftensor4('kerns')
@@ -499,7 +499,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
            )

    def test_pool(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        img_val = numpy.asarray(
@@ -524,7 +524,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
            )

    def test_pool_grad(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        img_grad = T.ftensor4('img_grad')
@@ -568,7 +568,7 @@ class TestDnnInferShapes(utt.InferShapeTester):

 # this has been a problem in the past
 def test_dnn_conv_border_mode():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
@@ -580,7 +580,7 @@ def test_dnn_conv_border_mode():


 def test_dnn_conv_alpha_output_merge():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
@@ -678,7 +678,7 @@ def test_dnn_conv_grad():


 def test_version():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    assert isinstance(dnn.version(), int)


--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
@@ -4,19 +4,19 @@ import theano
 from theano import scalar, gof
 from theano.tests.unittest_tools import SkipTest, assert_allclose

-from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
-                                               test_CAReduce, T_reduce_dtype)
+from theano.tensor.tests import test_elemwise

-from .test_basic_ops import mode_with_gpu, rand_gpuarray
+from .config import mode_with_gpu, test_ctx_name
+from .test_basic_ops import rand_gpuarray
 from ..elemwise import (GpuElemwise, GpuDimShuffle,
                        GpuCAReduceCuda, GpuCAReduceCPY)
-from ..type import GpuArrayType
+from ..type import GpuArrayType, get_context

 from pygpu import ndgpuarray as gpuarray


 # This is acutally a test for GpuElemwise
-class test_gpu_Broadcast(test_Broadcast):
+class test_gpu_Broadcast(test_elemwise.test_Broadcast):
    op = GpuElemwise
    type = GpuArrayType
    cop = GpuElemwise
@@ -25,8 +25,7 @@ class test_gpu_Broadcast(test_Broadcast):
    linkers = [gof.PerformLinker, gof.CLinker]

    def setUp(self):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            self.linkers = [gof.PerformLinker]

    def rand_val(self, shp):
@@ -36,14 +35,12 @@ class test_gpu_Broadcast(test_Broadcast):
        return rand_gpuarray(*shp, **dict(cls=gpuarray))

    def test_c(self):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            raise SkipTest("Cuda specific tests")
        super(test_gpu_Broadcast, self).test_c()

    def test_c_inplace(self):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            raise SkipTest("Cuda specific tests")
        super(test_gpu_Broadcast, self).test_c_inplace()

@@ -51,8 +48,7 @@ class test_gpu_Broadcast(test_Broadcast):
 def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of integer
    # or float input dtype.
-    dev = theano.sandbox.gpuarray.init_dev.device
-    if not dev.startswith('cuda'):
+    if get_context(test_ctx_name).kind != 'cuda':
        raise SkipTest("Cuda specific tests")

    dtypes = ["uint8", "uint16", "uint32", "uint64",
@@ -77,11 +73,11 @@ def test_elemwise_pow():
            assert_allclose(out, expected_out)


-class test_GpuDimShuffle(test_DimShuffle):
+class test_GpuDimShuffle(test_elemwise.test_DimShuffle):
    op = GpuDimShuffle


-class test_GpuCAReduceCPY(test_CAReduce):
+class test_GpuCAReduceCPY(test_elemwise.test_CAReduce):
    dtypes = ["float32"]
    bin_dtypes = ["uint8", "int8"]
    op = GpuCAReduceCPY
@@ -120,7 +116,7 @@ class test_GpuCAReduceCPY(test_CAReduce):

    def test_infer_shape(self):
        for dtype in self.dtypes:
-            test_CAReduce.test_infer_shape(self, dtype)
+            super(test_GpuCAReduceCPY, self).test_infer_shape(dtype)


 class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
@@ -133,15 +129,15 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((5, 6), (1, )),
             ((5, 6), (-1, )),
             ((5, 6), (-2, )),
-             #((5, 6), ()),  #reduce on no axis(copy) isn't implemented
-             #((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
-             #((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
+             # ((5, 6), ()),  #reduce on no axis(copy) isn't implemented
+             # ((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
+             # ((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
             ((5, 0), None),
             ((5, 0), (0, )),
             ((5, 0), (1, )),
-             #((5, 0), ()), reduce on no axis isn't implemented
-             #((), None), reduce on no axis isn't implemented
-             #((), ()) reduce on no axis isn't implemented
+             # ((5, 0), ()), reduce on no axis isn't implemented
+             # ((), None), reduce on no axis isn't implemented
+             # ((), ()) reduce on no axis isn't implemented

             # Test all GPU cases implemented
             ((1, 0), (1,)),
@@ -158,7 +154,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((0, 0, 0, 0), [0, 1, 2, 3]),
             ((5, 4, 3, 20), [2, 3]), ((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3]), ((5, 4, 3, 2), [1, 2, 3]),

-                               # test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
+             # test shape bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimension
             ((4100, 3), [0]), ((3, 4101), [0]),  # 10
             ((1024, 33), [0]), ((33, 1024), [0]),  # 10
             ((1025, 33), [0]), ((33, 1025), [0]),  # 10
@@ -176,7 +172,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]),  # 001
             ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]),  # 110
             ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]),  # 011
-             #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
+             # ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
             ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]),  # 111
             ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]),  # 111

@@ -189,17 +185,17 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):

             # test pattern implemented by reshape
             # Skip them as this test the op directly, not the optimization with reshape
-#             ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
-#             ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
-#             ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
-#             ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
-#             ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
-#             ((5,4,3,10,11),[1,2]),
-    ]
+             # ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
+             # ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
+             # ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
+             # ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
+             # ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
+             # ((5,4,3,10,11),[1,2]),
+             ]
    op = GpuCAReduceCuda
    reds = [scalar.add, scalar.mul,
            scalar.maximum, scalar.minimum]
-    pre_scalar_op = scalar.sqr
+    pre_scalar_op = None

    def test_perform(self):
        return
@@ -209,12 +205,11 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):

    def setUp(self):
        super(test_GpuCAReduceCuda, self).setUp()
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            raise SkipTest("Cuda specific tests")


-class T_gpureduce_dtype(T_reduce_dtype):
+class T_gpureduce_dtype(test_elemwise.T_reduce_dtype):
    mode = mode_with_gpu.excluding('local_cut_useless_reduce')
    op = GpuCAReduceCuda
    # Currently we don't support reduction on 0 axis
@@ -225,8 +220,7 @@ class T_gpureduce_dtype(T_reduce_dtype):
              'float32', 'float64']

    def setUp(self):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            raise SkipTest("Cuda specific tests")



--- a/theano/sandbox/gpuarray/tests/test_neighbours.py
+++ b/theano/sandbox/gpuarray/tests/test_neighbours.py
-
 from theano.tensor.nnet.tests import test_neighbours
-# We let that import do the init of the back-end if needed.
-from .test_basic_ops import mode_with_gpu
+
+from .config import mode_with_gpu

 from ..neighbours import GpuImages2Neibs


--- a/theano/sandbox/gpuarray/tests/test_nerv.py
+++ b/theano/sandbox/gpuarray/tests/test_nerv.py
@@ -6,7 +6,7 @@ from theano import function
 from theano.tests import unittest_tools as utt
 from theano.tensor import vector, matrix, dot

-from .test_basic_ops import mode_with_gpu
+from .config import mode_with_gpu
 from ..nerv import Gemm16, nerv



--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
@@ -7,9 +7,7 @@ import theano
 import theano.tensor as T
 import theano.tests.unittest_tools as utt

-# We let that import do the init of the back-end if needed.
-from .test_basic_ops import (mode_with_gpu,
-                             mode_without_gpu)
+from .config import mode_with_gpu, mode_without_gpu
 from ..nnet import (
    GpuCrossentropySoftmaxArgmax1HotWithBias,
    GpuCrossentropySoftmax1HotWithBiasDx,

--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -4,17 +4,16 @@ import theano
 from theano import tensor
 from theano.tests.breakpoint import PdbBreakpoint
 from theano.tests import unittest_tools as utt
-from theano.tests.unittest_tools import SkipTest
 from theano.tensor.tests import test_basic

 import theano.sandbox.gpuarray
 from .. import basic_ops
-from ..type import GpuArrayType, gpuarray_shared_constructor
-from ..basic_ops import (GpuAlloc, GpuReshape, gpu_alloc,
-                         gpu_from_host, host_from_gpu)
+from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
+from ..basic_ops import GpuAlloc, GpuReshape, GpuFromHost, host_from_gpu
 from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
 from ..subtensor import GpuSubtensor
-from .test_basic_ops import rand_gpuarray, mode_with_gpu, mode_without_gpu
+
+from .config import mode_with_gpu, test_ctx_name


 def test_local_assert():
@@ -97,7 +96,7 @@ def test_flatten():


 def test_reduce():
-    dev = theano.sandbox.gpuarray.init_dev.device
+    kind = get_context(test_ctx_name).kind

    for method, param in [('sum', dict(acc_dtype='float32')),
                          ('prod', dict(acc_dtype='float32')),
@@ -113,7 +112,7 @@ def test_reduce():
        topo = f.maker.fgraph.toposort()
        ops = [type(node.op) for node in topo]

-        if dev.startswith('opencl') and method in ["max", "min"]:
+        if kind == 'opencl' and method in ["max", "min"]:
            assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
        else:
            assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
@@ -126,7 +125,7 @@ def test_local_gpualloc_memset_0():
    ones = numpy.ones((2,), dtype='float32')

    # Test with 0
-    a = gpu_alloc(z, i)
+    a = GpuAlloc(test_ctx_name)(z, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
@@ -134,7 +133,7 @@ def test_local_gpualloc_memset_0():
    assert (numpy.asarray(f(6)) == 0).all()

    # Test with 1
-    a = gpu_alloc(o, i)
+    a = GpuAlloc(test_ctx_name)(o, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
@@ -143,7 +142,7 @@ def test_local_gpualloc_memset_0():
    assert (numpy.asarray(f(6)) == 1).all()

    # Test with 1, 1
-    a = gpu_alloc(ones, i)
+    a = GpuAlloc(test_ctx_name)(ones, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
@@ -180,7 +179,7 @@ def test_print_op():
    f = theano.function([b], theano.printing.Print()(b) * 2,
                        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
-    assert topo[0].op == gpu_from_host
+    assert isinstance(topo[0].op, GpuFromHost)
    assert isinstance(topo[1].op, theano.printing.Print)
    assert isinstance(topo[2].op, GpuElemwise)
    assert topo[3].op == host_from_gpu
@@ -208,7 +207,7 @@ def test_pdbbreakpoint_op():

 def test_local_gpu_elemwise_careduce():
    x = theano.tensor.matrix()
-    o = (x*x).sum()
+    o = (x * x).sum()
    f = theano.function([x], o, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
@@ -234,7 +233,7 @@ def test_local_gpu_subtensor():
    # Test multiple use of the input
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = tensor.fmatrix()
-    f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
+    f = theano.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any([type(node.op) is tensor.Subtensor for node in topo])
    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
@@ -242,7 +241,7 @@ def test_local_gpu_subtensor():
    # Test multiple use of the input + input as output
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = tensor.fmatrix()
-    f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
+    f = theano.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any([type(node.op) is tensor.Subtensor for node in topo])
    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
@@ -250,7 +249,7 @@ def test_local_gpu_subtensor():
    # Test shared forced on CPU end we do computation on the output of
    # the subtensor.
    t = tensor._shared(numpy.zeros(20, "float32"))
-    f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
+    f = theano.function([], t[3:4] + 1, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any([type(node.op) is tensor.Subtensor for node in topo])
    assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
@@ -319,7 +318,7 @@ def test_local_gpu_elemwise():
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
-    c = cuda.shared_constructor(numpy.asarray(c_v, dtype='float32'))
+    c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype='float32'))
    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)

--- a/theano/sandbox/gpuarray/tests/test_scan.py
+++ b/theano/sandbox/gpuarray/tests/test_scan.py
--- a/theano/sandbox/gpuarray/tests/test_subtensor.py
+++ b/theano/sandbox/gpuarray/tests/test_subtensor.py
--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py