Merge pull request #3482 from abergeron/multi_gpu_new2

Multi-gpu support

Merge pull request #3482 from abergeron/multi_gpu_new2
4814cd99 · Pascal Lamblin · 6ca7b2b6 · ec927f7d · 4814cd99 · 4814cd99
--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -112,7 +112,8 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
 if (config.device.startswith('cuda') or
        config.device.startswith('opencl') or
        config.init_gpu_device.startswith('cuda') or
-        config.init_gpu_device.startswith('opencl')):
+        config.init_gpu_device.startswith('opencl') or
+        config.contexts != ''):
    import theano.sandbox.gpuarray

 # Use config.numpy to call numpy.seterr

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -111,6 +111,29 @@ AddConfigVar(
    BoolParam(False, allow_override=False),
    in_c_key=False)

+
+class ContextsParam(ConfigParam):
+    def __init__(self):
+        def filter(val):
+            if val == '':
+                return val
+            for v in val.split(';'):
+                s = v.split('->')
+                if len(s) != 2:
+                    raise ValueError("Malformed context map: %s" % (v,))
+            return val
+        ConfigParam.__init__(self, '', filter, False)
+
+AddConfigVar(
+    'contexts',
+    """
+    Context map for multi-gpu operation. Format is a
+    semicolon-separated list of names and device names in the
+    'name->dev_name' format. An example that would map name 'test' to
+    device 'cuda0' and name 'test2' to device 'opencl0:0' follows:
+    "test->cuda0;test2->opencl0:0".
+    """, ContextsParam(), in_c_key=False)
+
 AddConfigVar(
    'print_active_device',
    "Print active device at when the GPU device is initialized.",

--- a/theano/misc/check_multi_gpu.py
+++ b/theano/misc/check_multi_gpu.py
+#! /usr/bin/env python
+"""
+This file compare the runtime of two independent dot products on one
+and two GPU to measure the speedup.
+
+This should be 2x if the GPUs are equivalent.
+"""
+import time
+
+import numpy
+
+import theano
+from theano.sandbox.gpuarray import init_dev
+from theano.sandbox.gpuarray.type import gpuarray_shared_constructor as shared
+from theano.sandbox.gpuarray.blas import gpu_dot22
+
+
+def main(dev1, dev2):
+    init_dev(dev1, 'ctx1')
+    init_dev(dev2, 'ctx2')
+
+    val1a = shared(numpy.random.randn(1024, 1024).astype('float32'),
+                   context_name='ctx1')
+    val1b = shared(numpy.random.randn(1024, 1024).astype('float32'),
+                   context_name='ctx1')
+    val1c = shared(numpy.random.randn(1024, 1024).astype('float32'),
+                   context_name='ctx1')
+    val1d = shared(numpy.random.randn(1024, 1024).astype('float32'),
+                   context_name='ctx1')
+
+    val2a = shared(numpy.random.randn(1024, 1024).astype('float32'),
+                   context_name='ctx2')
+    val2b = shared(numpy.random.randn(1024, 1024).astype('float32'),
+                   context_name='ctx2')
+
+    f1 = theano.function([], [gpu_dot22(val1a, val1b),
+                              gpu_dot22(val1c, val1d)])
+    f2 = theano.function([], [gpu_dot22(val1a, val1b),
+                              gpu_dot22(val2a, val2b)])
+
+    r = f1()
+    r[0].sync(), r[1].sync()
+    r = None
+    t = time.time()
+    r = f1()
+    r[0].sync(), r[1].sync()
+    t2 = time.time()
+    r = None
+
+    print("one ctx %f" % (t2 - t,))
+
+    r = f2()
+    r[0].sync(), r[1].sync()
+    r = None
+    t = time.time()
+    r = f2()
+    r[0].sync(), r[1].sync()
+    t2 = time.time()
+    r = None
+
+    print("two ctx %f" % (t2 - t,))
+
+if __name__ == '__main__':
+    import sys
+    if len(sys.argv) != 3:
+        raise ValueError("This script require two device names.")
+    main(sys.argv[1], sys.argv[2])
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -92,10 +92,7 @@ class HostFromGpu(GpuOp):

    def R_op(self, inputs, eval_points):
        ev, = eval_points
-        if isinstance(ev, tensor.TensorType):
-            return [gpu_from_host(ev)]
-        else:
-            return [ev]
+        return self(ev)

    def infer_shape(self, node, xshp):
        return xshp
@@ -155,10 +152,7 @@ class GpuFromHost(GpuOp):

    def R_op(self, inputs, eval_points):
        ev, = eval_points
-        if isinstance(ev, CudaNdarrayType):
-            return [host_from_gpu(ev)]
-        else:
-            return [ev]
+        self(ev)

    def infer_shape(self, node, xshp):
        return xshp

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -2478,8 +2478,11 @@ def local_gpu_allocempty(node):
    return False


+def typeInfer(node):
+    return typeConstructor
+
 optdb.register('gpu_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
+               scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer,
                                             gpu_flag=True),
               75,
               'gpu',

--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -21,26 +21,30 @@ except ImportError:

 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
-                  GpuArraySharedVariable, gpuarray_shared_constructor)
+                   GpuArraySharedVariable, gpuarray_shared_constructor,
+                   reg_context)
 from . import opt, nerv


-def init_dev(dev):
+def init_dev(dev, name=None):
    if pygpu.gpuarray.api_version() != (-10000, 0):
        raise RuntimeError("Wrong API version for gpuarray:",
                           pygpu.gpuarray.api_version(),
                           "Make sure Theano and libgpuarray/pygpu "
                           "are in sync.")
    global pygpu_activated
-    context = pygpu.init(dev)
-    pygpu.set_default_context(context)
+    if dev not in init_dev.devmap:
+        init_dev.devmap[dev] = pygpu.init(dev)
+    context = init_dev.devmap[dev]
+    # This will map the context name to the real context object.
+    reg_context(name, context)
    pygpu_activated = True
    if config.print_active_device:
-        print("Using device %s: %s" % (dev, context.devname), file=sys.stderr)
-    # remember the active device
-    init_dev.device = dev
+        print("Mapped name %s to device %s: %s" % (name, dev, context.devname),
+              file=sys.stderr)

-init_dev.device = None
+# This maps things like 'cuda0' to the context object on that device.
+init_dev.devmap = {}

 if pygpu:
    try:
@@ -52,11 +56,21 @@ if pygpu:
            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
        elif (config.init_gpu_device.startswith('cuda') or
              config.init_gpu_device.startswith('opencl')):
+            if config.device != 'cpu':
+                raise ValueError('you must set device=cpu to use init_gpu_device.')
+            if config.contexts != '':
+                print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
            init_dev(config.init_gpu_device)
+        if config.contexts != '':
+            for n, d in (c.split('->') for c in config.contexts.split(';')):
+                init_dev(d.strip(), n.strip())
+            import theano.compile
+            theano.compile.shared_constructor(gpuarray_shared_constructor)
+            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')

        from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
                                GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
-        from .basic_ops import host_from_gpu, gpu_from_host
+        from .basic_ops import host_from_gpu, GpuFromHost
        from .elemwise import GpuElemwise
        from .subtensor import (GpuSubtensor, GpuIncSubtensor,
                                GpuAdvancedIncSubtensor1)
@@ -67,5 +81,6 @@ else:
    if (config.init_gpu_device.startswith('cuda') or
            config.init_gpu_device.startswith('opencl') or
            config.device.startswith('opencl') or
-            config.device.startswith('cuda')):
+            config.device.startswith('cuda') or
+            config.contexts != ''):
        error("pygpu was configured but could not be imported", exc_info=True)
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -9,7 +9,9 @@ from theano.tensor.basic import Alloc, Join, Split

 from theano.gof import HideC
 from theano.gof.utils import MethodNotDefined
-from theano.compat import PY3
+
+from collections import deque
+
 from six import string_types
 from six.moves import xrange

@@ -19,27 +21,83 @@ try:
 except ImportError:
    pass

-from .type import GpuArrayType
+from .type import GpuArrayType, GpuArrayConstant, gpu_context_type, get_context
 from .fp16_help import write_w


-def as_gpuarray_variable(x):
+def as_gpuarray_variable(x, context_name):
+    # If this is already some form of variable, try to avoid an extra transfer
+    if isinstance(x, Variable):
+        while True:
+            # If we are already a GpuArrayVariable in the right context
+            # then there is nothing to do.
+            if (isinstance(x.type, GpuArrayType) and
+                    x.type.context_name == context_name):
+                return x
+
+            # If x is the result of a transfer, try to dig through.
            if getattr(x, 'owner', None):
                if isinstance(x.owner.op, HostFromGpu):
-            return x.owner.inputs[0]
-        elif (isinstance(x.owner.op, GpuFromHost) and
-              x.owner.inputs[0].owner and
-              isinstance(x.owner.inputs[0].owner.op, HostFromGpu)):
-            return x.owner.inputs[0].owner.inputs[0]
+                    x = x.owner.inputs[0]
+                    continue
+                if isinstance(x.owner.op, GpuFromHost):
+                    x = x.owner.inputs[0]
+                    continue
+                if isinstance(x.owner.op, GpuToGpu):
+                    x = x.owner.inputs[0]
+                    continue
+
+            # If none of the conditions where met, then continue with
+            # the rest of the body
+            break
+
+        # If we couldn't deal with transfers, then maybe it's a tensor
+        if isinstance(x.type, tensor.TensorType):
+            return GpuFromHost(context_name)(x)
+
+    # Try _as_GpuArrayVariable if possible
    if hasattr(x, '_as_GpuArrayVariable'):
-        return x._as_GpuArrayVariable()
-    # TODO we need to have the cuda -> gpu path taken care of.
-    tensor_x = tensor.as_tensor_variable(x)
-    return gpu_from_host(tensor_x)
+        return x._as_GpuArrayVariable(context_name)

+    # If it didn't work try for a constant
+    ctx = get_context(context_name)

-def as_gpuarray(x):
-    return gpuarray.array(x, copy=False)
+    if isinstance(x, gpuarray.GpuArray):
+        if x.context.ptr != ctx.ptr:
+            x = x.transfer(ctx)
+
+    x = gpuarray.asarray(x, context=ctx)
+
+    bcast = [(s == 1) for s in x.shape]
+    return GpuArrayConstant(GpuArrayType(dtype=x.dtype,
+                                         broadcastable=bcast,
+                                         context_name=context_name),
+                            x)
+
+
+def infer_context_name(*vars):
+    """
+    Infer the context name to use from the inputs given
+
+    """
+    # We try to infer the closest context first
+    # TODO: What to do in case of context conflicts?
+    #       We currently use a first found wins approach.
+    todo = deque()
+    todo.extendleft(vars)
+    while todo:
+        v = todo.pop()
+        if isinstance(v.type, GpuArrayType):
+            return v.type.context_name
+        if hasattr(v.tag, 'context_name'):
+            return v.tag.context_name
+        if v.owner:
+            if isinstance(v.owner.op, HostFromGpu):
+                return v.owner.inputs[0].type.context_name
+            if len(v.owner.inputs) == 1:
+                todo.extendleft(v.owner.inputs)
+    # If we can't find a context we infer None, which is the default
+    return None


 class Kernel(object):
@@ -111,10 +169,12 @@ class Kernel(object):


 class GpuKernelBase(object):
+    context_type = gpu_context_type
+
    def gpu_kernels(self, node, name):
        """
-        This is the method to override. This should return an iterable of Kernel
-        objects that describe the kernels this op will need.
+        This is the method to override. This should return an iterable
+        of Kernel objects that describe the kernels this op will need.

        """
        raise MethodNotDefined('gpu_kernels')
@@ -126,8 +186,9 @@ class GpuKernelBase(object):
            o = []
        return o + ['gpuarray/types.h']

-    def _generate_kernel_bin(self, k):
-        gk = gpuarray.GpuKernel(k.code, k.name, k.params, **k.flags)
+    def _generate_kernel_bin(self, k, ctx):
+        gk = gpuarray.GpuKernel(k.code, k.name, k.params, context=ctx,
+                                **k.flags)
        bin = gk._binary
        bcode = ','.join(hex(ord(c)) for c in bin)
        return ("""static const char %(bname)s[] = { %(bcode)s };""" %
@@ -140,7 +201,7 @@ class GpuKernelBase(object):
                dict(cname=k.codevar, code=code))

    def _generate_kernel_vars(self, k):
-        return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
+        return """GpuKernel %(kname)s;""" % dict(kname=k.objvar)

    def c_support_code(self):
        return """
@@ -153,46 +214,62 @@ class GpuKernelBase(object):

    def c_support_code_apply(self, node, name):
        kernels = self.gpu_kernels(node, name)
-        bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
+        ctx = self.get_context(node)
+        bins = '\n'.join(self._generate_kernel_bin(k, ctx) for k in kernels)
        codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
-        vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
-        return '\n'.join([bins, codes, vars])
+        return '\n'.join([bins, codes])

-    def _generate_kernel_init(self, k, err):
-        if PY3:
-            error_out = "NULL"
-        else:
-            error_out = ""
+    def c_support_code_struct(self, node, name):
+        kernels = self.gpu_kernels(node, name)
+        return '\n'.join(self._generate_kernel_vars(k) for k in kernels)
+
+    def _generate_zeros(self, k):
+        return """memset(&%(v)s, 0, sizeof(%(v)s));""" % dict(v=k.objvar)
+
+    def _generate_kernel_init(self, k, fail, ctx):
        return """{
+  int err;
  int types[%(numargs)u] = {%(types)s};
  const char *bcode = %(bvar)s;
  size_t sz = sizeof(%(bvar)s);
-  PyGpuContextObject *c = pygpu_default_context();
-  if (GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &bcode, &sz, "%(kname)s",
-                     %(numargs)u, types, GA_USE_BINARY, NULL) != GA_NO_ERROR) {
-    if ((%(err)s = GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &%(cname)s,
-                                  NULL, "%(kname)s", %(numargs)u, types,
-                                  %(flags)s, NULL)) != GA_NO_ERROR) {
+  if (GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1, &bcode, &sz,
+                     "%(kname)s", %(numargs)u, types, GA_USE_BINARY, NULL)
+      != GA_NO_ERROR) {
+    if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1,
+                              &%(cname)s, NULL, "%(kname)s", %(numargs)u,
+                              types, %(flags)s, NULL)) != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
-                   %(err)s, Gpu_error(c->ops, c->ctx, %(err)s));
-      return %(error_out)s;
+                   err, Gpu_error(%(ctx)s->ops, %(ctx)s->ctx, err));
+      %(fail)s
    }
  }
 }""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar,
-            ovar=k.objvar, kname=k.name, err=err, cname=k.codevar,
-            flags=k._get_c_flags(), error_out=error_out)
+            ovar=k.objvar, kname=k.name, cname=k.codevar,
+            flags=k._get_c_flags(), fail=fail, ctx=ctx)

-    def c_init_code_apply(self, node, name):
-        err = 'err_' + name
+    def c_init_code_struct(self, node, name, sub):
+        ctx = sub['context']
        kernels = self.gpu_kernels(node, name)
-        inits = '\n'.join(self._generate_kernel_init(k, err) for k in kernels)
-        return ("int %(err)s;\n" % dict(err=err)) + inits
+        inits_0 = '\n'.join(self._generate_zeros(k) for k in kernels)
+        inits = '\n'.join(self._generate_kernel_init(k, sub['fail'], ctx)
+                          for k in kernels)
+        return '\n'.join([inits_0, inits])
+
+    def _generate_kernel_cleanup(self, k):
+        return "GpuKernel_clear(&%(ovar)s);" % dict(ovar=k.objvar)

-    def _GpuKernelBase_version(self):
-        ctx = gpuarray.get_default_context()
-        return (2, ctx.kind, ctx.devname)
+    def c_cleanup_code_struct(self, node, name):
+        kernels = self.gpu_kernels(node, name)
+        cleanups = '\n'.join(self._generate_kernel_cleanup(k) for k in kernels)
+        return cleanups

-    GpuKernelBase_version = property(_GpuKernelBase_version)
+    # This is a shorthand for if your op only has a fixed version
+    # You can reimplement it, but make sure to call kernel_version()
+    def c_code_cache_version_apply(self, node):
+        return (self.c_code_cache_version(), self.kernel_version(node))
+
+    def kernel_version(self, node):
+        return (3, node.get_context().bin_id)


 class HostFromGpu(Op):
@@ -259,50 +336,52 @@ class HostFromGpu(Op):

    def grad(self, inputs, grads):
        gz, = grads
-        return [gpu_from_host(gz)]
+        return [GpuFromHost(inputs[0].type.context_name)(gz)]

    def R_op(self, inputs, eval_points):
        ev, = eval_points
-        if isinstance(ev, tensor.TensorType):
-            return [gpu_from_host(ev)]
-        else:
-            return [ev]
+        return self(ev)

    def infer_shape(self, node, xshp):
        return xshp

-
 host_from_gpu = HostFromGpu()


 class GpuFromHost(Op):
-    __props__ = ()
+    __props__ = ('context_name',)
    _f16_ok = True
+    context_type = gpu_context_type
+
+    def __init__(self, context_name):
+        self.context_name = context_name

    def __str__(self):
-        return 'GpuFromHost(gpuarray)'
+        return 'GpuFromHost<%s>' % (self.context_name,)

    def make_node(self, x):
        if not isinstance(x.type, tensor.TensorType):
            raise TypeError(x)
        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
+                                              context_name=self.context_name,
                                              dtype=x.dtype)()])

-    def perform(self, node, inp, out):
+    def get_context(self, node):
+        return get_context(self.context_name)
+
+    def perform(self, node, inp, out, ctx):
        x, = inp
        z, = out
-        z[0] = gpuarray.array(x)
+        z[0] = gpuarray.array(x, context=ctx)

    def grad(self, inputs, grads):
        gz, = grads
-        return [host_from_gpu(as_gpuarray_variable(gz))]
+        return [host_from_gpu(as_gpuarray_variable(
+                gz, context_name=self.context_name))]

    def R_op(self, inputs, eval_points):
        ev, = eval_points
-        if isinstance(ev, GpuArrayType):
-            return [host_from_gpu(ev)]
-        else:
-            return [ev]
+        return self(ev)

    def infer_shape(self, node, xshp):
        return xshp
@@ -319,19 +398,67 @@ class GpuFromHost(Op):
                                     PyArray_NDIM(%(name)s_tmp),
                                     (size_t *)PyArray_DIMS(%(name)s_tmp),
                                     (ssize_t *)PyArray_STRIDES(%(name)s_tmp),
-                                     pygpu_default_context(),
+                                     %(ctx)s,
                                     Py_None);
-
        Py_DECREF(%(name)s_tmp);
-        if (%(out)s == NULL)
+        if (%(out)s == NULL) {
            %(fail)s
-        """ % {'name': name, 'inp': inputs[0],
+        }
+        """ % {'name': name, 'inp': inputs[0], 'ctx': sub['context'],
               'out': outputs[0], 'fail': sub['fail']}

    def c_code_cache_version(self):
-        return (5,)
+        return (7,)
+
+
+class GpuToGpu(Op):
+    __props__ = ('context_name',)
+    _f16_ok = True
+    context_type = gpu_context_type
+
+    def __init__(self, context_name):
+        self.context_name = context_name
+
+    def __str__(self):
+        return 'GpuToGpu<%s>' % (self.context_name,)
+
+    def make_node(self, x):
+        if not isinstance(x.type, GpuArrayType):
+            raise TypeError(x)
+        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
+                                              context_name=self.context_name,
+                                              dtype=x.dtype)()])

-gpu_from_host = GpuFromHost()
+    def get_context(self, node):
+        return get_context(self.context_name)
+
+    def perform(self, node, inp, out, ctx):
+        x, = inp
+        z, = out
+        z[0] = x.transfer(ctx)
+
+    def grad(self, inputs, grads):
+        gz, = grads
+        return [GpuToGpu(inputs[0].type.context_name)(gz)]
+
+    def R_op(self, inputs, eval_points):
+        return self(eval_points[0])
+
+    def infer_shape(self, node, xshp):
+        return xshp
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        return """
+        Py_XDECREF(%(out)s);
+        %(out)s = pygpu_transfer(%(inp)s, %(ctx)s, 0);
+        if (%(out)s == NULL) {
+            %(fail)s
+        }
+        """ % {'inp': inputs[0], 'ctx': sub['context'],
+               'out': outputs[0], 'fail': sub['fail']}
+
+    def c_code_cache_version(self):
+        return (0,)


 class GpuAlloc(HideC, Alloc):
@@ -339,28 +466,35 @@ class GpuAlloc(HideC, Alloc):

    Parameters
    ----------
-    memset_0
+    context_name : str
+        The name of the context in which to allocate memory
+    memset_0 : bool
        It's only an optimized version. True, it means the
        value is always 0, so the c code call memset as it is faster.

    """

-    __props__ = ('memset_0',)
+    __props__ = ('memset_0', 'context_name')
    _f16_ok = True
+    context_type = gpu_context_type

-    def __init__(self, memset_0=False):
+    def __init__(self, context_name, memset_0=False):
+        self.context_name = context_name
        self.memset_0 = memset_0

+    def get_context(self, node):
+        return get_context(self.context_name)
+
    def __str__(self):
        # Hide the memset parameter when not used to prevent confusion.
        if self.memset_0:
-            s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
+            m = "{memset_0=True}"
        else:
-            s = self.__class__.__name__
-        return s
+            m = ""
+        return "%s<%s>%s" % (self.__class__.__name__, self.context_name, m)

    def make_node(self, value, *shape):
-        value = as_gpuarray_variable(value)
+        value = as_gpuarray_variable(value, context_name=self.context_name)
        sh, bcast = self.validate_shape(shape)
        if value.ndim > len(sh):
            TypeError("The GpuAlloc value to use has more dimensions "
@@ -371,15 +505,15 @@ class GpuAlloc(HideC, Alloc):
    def c_headers(self):
        return ['<numpy_compat.h>']

-    def perform(self, node, inputs, outs):
+    def perform(self, node, inputs, outs, ctx):
        out, = outs
        v = inputs[0]
        sh = tuple(map(int, inputs[1:]))
        if out[0] is None or out[0].shape != sh:
            if self.memset_0:
-                out[0] = gpuarray.zeros(sh, dtype=v.dtype)
+                out[0] = gpuarray.zeros(sh, dtype=v.dtype, context=ctx)
            else:
-                out[0] = gpuarray.empty(sh, dtype=v.dtype)
+                out[0] = gpuarray.empty(sh, dtype=v.dtype, context=ctx)
                out[0][...] = v
        else:
            out[0][...] = v
@@ -414,7 +548,7 @@ class GpuAlloc(HideC, Alloc):
            Py_XDECREF(%(zz)s);
            %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
                                 %(vv)s->ga.typecode, GA_C_ORDER,
-                                 pygpu_default_context(), Py_None);
+                                 %(ctx)s, Py_None);
            if (!%(zz)s) {
                %(fail)s
            }
@@ -423,7 +557,7 @@ class GpuAlloc(HideC, Alloc):
                Py_XDECREF(%(zz)s);
                %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
                                     %(vv)s->ga.typecode, GA_C_ORDER,
-                                     pygpu_default_context(), Py_None);
+                                     %(ctx)s, Py_None);
                if (!%(zz)s) {
                    %(fail)s
                }
@@ -434,9 +568,9 @@ class GpuAlloc(HideC, Alloc):
                if (err != GA_NO_ERROR)
                {
                    PyErr_Format(PyExc_MemoryError,
-                                 "GpuAlloc: Error memsetting %%d"
+                                 "GpuAlloc: Error memsetting %%llu"
                                 " element of device memory to 0.",
-                                 PyGpuArray_SIZE(%(zz)s));
+                                 (unsigned long long)PyGpuArray_SIZE(%(zz)s));
                    %(fail)s;
                }
            }
@@ -446,7 +580,7 @@ class GpuAlloc(HideC, Alloc):
                %(fail)s
            }
        }
-        """ % dict(name=name, ndim=ndim, zz=zz, vv=vv,
+        """ % dict(name=name, ndim=ndim, zz=zz, vv=vv, ctx=sub['context'],
                   fail=sub['fail'], memset_0=memset_0)

        if config.gpuarray.sync:
@@ -455,7 +589,7 @@ class GpuAlloc(HideC, Alloc):
        return code

    def c_code_cache_version(self):
-        return (2,)
+        return (3,)

    def do_constant_folding(self, node):
        from . import subtensor, blas
@@ -488,29 +622,32 @@ class GpuAlloc(HideC, Alloc):
        return True


-gpu_alloc = GpuAlloc()
-
-
 class GpuAllocEmpty(HideC, Alloc):
-    __props__ = ('dtype',)
+    __props__ = ('dtype', 'context_name')
    _f16_ok = True
+    context_type = gpu_context_type

-    def __init__(self, dtype):
+    def __init__(self, dtype, context_name):
        self.dtype = dtype
+        self.context_name = context_name
+
+    def get_context(self, node):
+        return get_context(self.context_name)

    def make_node(self, *shape):
        sh, bcast = self.validate_shape(shape)
-        output = GpuArrayType(dtype=self.dtype, broadcastable=bcast)()
+        output = GpuArrayType(dtype=self.dtype, broadcastable=bcast,
+                              context_name=self.context_name)()
        output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
        # The outut can contain nan/inf.
        output.type.filter_checks_isfinite = False
        return Apply(self, sh, [output])

-    def perform(self, node, inputs, out_):
+    def perform(self, node, inputs, out_, ctx):
        out = out_[0]
        sh = [int(i) for i in inputs]
        if out[0] is None or out[0].shape != sh:
-            out[0] = pygpu.empty(sh, dtype=self.dtype)
+            out[0] = pygpu.empty(sh, dtype=self.dtype, context=ctx)
        # if out[0] is the right shape, we just return it

    def c_headers(self):
@@ -536,16 +673,16 @@ shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];

        code.append("""
 if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
-                       pygpu_default_context())) {
+                       %(ctx)s)) {
  %(fail)s
 }
 """ % dict(zz=zz, ndim=ndim, type=gpuarray.dtype_to_typecode(self.dtype),
-           fail=fail))
+           fail=fail, ctx=sub['context']))

        return ''.join(code)

    def c_code_cache_version(self):
-        return (0,)
+        return (1,)

    def do_constant_folding(self, node):
        return False
@@ -559,7 +696,7 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,


 def empty_like(var):
-    return GpuAllocEmpty(var.type.dtype)(*var.shape)
+    return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)


 class GpuContiguous(Op):
@@ -568,7 +705,6 @@ class GpuContiguous(Op):
    not already c contiguous.

    """
-
    __props__ = ()
    view_map = {0: [0]}
    _f16_ok = True
@@ -576,12 +712,13 @@ class GpuContiguous(Op):
    def grad(self, inputs, dout):
        x, = inputs
        dout, = dout
-        dout = as_gpuarray_variable(dout)
+        dout = as_gpuarray_variable(dout, context_name=infer_context_name(x))

        return [dout]

    def make_node(self, input):
-        input = as_gpuarray_variable(input)
+        input = as_gpuarray_variable(input,
+                                     context_name=infer_context_name(input))
        return Apply(self, [input], [input.type()])

    def c_headers(self):
@@ -633,10 +770,12 @@ class GpuReshape(HideC, tensor.Reshape):

    # __hash__, __eq__, __str__ come from tensor.Reshape
    def make_node(self, x, shp):
-        x = as_gpuarray_variable(x)
+        ctx_name = infer_context_name(x)
+        x = as_gpuarray_variable(x, context_name=ctx_name)
        res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
        otype = GpuArrayType(dtype=res.dtype,
-                             broadcastable=res.broadcastable)
+                             broadcastable=res.broadcastable,
+                             context_name=ctx_name)
        return Apply(self, [x, shp], [otype()])

    def perform(self, node, inp, out_):
@@ -744,22 +883,30 @@ class GpuReshape(HideC, tensor.Reshape):


 class GpuJoin(HideC, Join):
-
    _f16_ok = True
+    context_type = gpu_context_type

    def make_node(self, axis, *tensors):
        node = Join.make_node(self, axis, *tensors)

-        return Apply(self, [node.inputs[0]] + list(map(as_gpuarray_variable,
-                                                       tensors)),
+        ctx_name = infer_context_name(*tensors)
+
+        def agv(v):
+            return as_gpuarray_variable(v, context_name=ctx_name)
+
+        return Apply(self, [node.inputs[0]] + list(map(agv, tensors)),
                     [GpuArrayType(broadcastable=node.outputs[0].broadcastable,
-                                   dtype=node.outputs[0].dtype)()])
+                                   dtype=node.outputs[0].dtype,
+                                   context_name=ctx_name)()])
+
+    def get_context(self, node):
+        return node.outputs[0].type.context

-    def perform(self, node, axis_and_tensors, out_):
+    def perform(self, node, axis_and_tensors, out_, ctx):
        out, = out_
        axis = int(axis_and_tensors[0])
        tensors = axis_and_tensors[1:]
-        out[0] = pygpu.concatenate(tensors, axis=axis).astype(
+        out[0] = pygpu.concatenate(tensors, axis=axis, context=ctx).astype(
            node.outputs[0].dtype)

    def c_code_cache_version(self):
@@ -793,15 +940,14 @@ if (axis < 0) {
 }
 %(out)s = pygpu_concatenate(als, %(n)s, axis,
                            %(restype)s, (PyObject *)&PyGpuArrayType,
-                            pygpu_default_context());
+                            %(ctx)s);
 }
 PyMem_Free(als);
 if (%(out)s == NULL)
  %(fail)s
        """ % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0],
                   axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list),
-                   restype=restype)
-
+                   restype=restype, ctx=sub['context'])

 gpu_join = GpuJoin()

@@ -809,21 +955,26 @@ gpu_join = GpuJoin()
 class GpuSplit(HideC, Split):
    def make_node(self, x, axis, splits):
        node = Split.make_node(self, x, axis, splits)
-        x = as_gpuarray_variable(x)
-        outs = [GpuArrayType(dtype=o.dtype, broadcastable=o.broadcastable)()
+        x = as_gpuarray_variable(x, infer_context_name(x))
+        outs = [GpuArrayType(dtype=o.dtype, broadcastable=o.broadcastable,
+                             context_name=x.type.context_name)()
                for o in node.outputs]
        return Apply(self, [x] + node.inputs[1:], outs)
    # we reuse the perform of the CPU op, which is suitable


 class GpuEye(GpuKernelBase, Op):
-    __props__ = ('dtype',)
+    __props__ = ('dtype', 'context_name')
    _f16_ok = True

-    def __init__(self, dtype=None):
+    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
+        self.context_name = context_name
+
+    def get_context(self, node):
+        return get_context(self.context_name)

    def make_node(self, n, m, k):
        n = tensor.as_tensor_variable(n)
@@ -833,7 +984,8 @@ class GpuEye(GpuKernelBase, Op):
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
-                             broadcastable=(False, False))
+                             broadcastable=(False, False),
+                             context_name=self.context_name)

        # k != 0 isn't implemented on the GPU yet.
        assert tensor.get_scalar_constant_value(k) == 0
@@ -866,6 +1018,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
        n, m = inp
        z, = out
        fail = sub['fail']
+        ctx = sub['context']
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        sync = bool(config.gpuarray.sync)
        kname = self.gpu_kernels(node, name)[0].objvar
@@ -882,7 +1035,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
-                            pygpu_default_context(), Py_None);
+                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }
@@ -908,4 +1061,4 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
        return s

    def c_code_cache_version(self):
-        return (4, self.GpuKernelBase_version)
+        return (5,)
--- a/theano/sandbox/gpuarray/blas.py
+++ b/theano/sandbox/gpuarray/blas.py
 import os.path

-from theano import Apply, config
+from theano import Apply, config, Op

 from theano.compile import optdb
-from theano.gof import local_optimizer, LocalOptGroup
+from theano.gof import LocalOptGroup
 from theano.tensor.basic import as_tensor_variable
-from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
 from theano.tensor.opt import in2out

-from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
+from .basic_ops import as_gpuarray_variable, infer_context_name
+
+from .opt_util import inplace_allocempty

 try:
    import pygpu
@@ -18,7 +19,7 @@ except ImportError as e:
    pass


-class BlasOp(HideC):
+class BlasOp(Op):
    def c_headers(self):
        return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>']

@@ -28,34 +29,27 @@ class BlasOp(HideC):
    def c_init_code(self):
        return ['import_pygpu__blas();']

-    def c_support_code(self):
-        return """
-PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
-                                   PyGpuArrayObject *y) {
-  if (out &&
-      GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
-      theano_size_check(out, PyGpuArray_NDIM(y),
-                        PyGpuArray_DIMS(y),
-                        y->ga.typecode)) {
-    if (pygpu_move(out, y)) {
-      Py_XDECREF(out);
-      return NULL;
-    }
-  } else {
-    Py_XDECREF(out);
-    out = pygpu_copy(y, GA_ANY_ORDER);
-  }
-  return out;
-}
-"""

+class GpuGemv(BlasOp):
+    __props__ = ('inplace',)
+
+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}

-class GpuGemv(BlasOp, Gemv):
    def make_node(self, y, alpha, A, x, beta):
-        Gemv.make_node(self, y, alpha, A, x, beta)
-        A = as_gpuarray_variable(A)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(y, A, x)
+        A = as_gpuarray_variable(A, ctx_name)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
+        alpha = as_tensor_variable(alpha)
+        beta = as_tensor_variable(beta)
+        assert alpha.ndim == 0
+        assert beta.ndim == 0
+        assert A.ndim == 2
+        assert x.ndim == 1
+        assert y.ndim == 1
        assert A.dtype == x.dtype == y.dtype
        return Apply(self, [y, alpha, A, x, beta], [y.type()])

@@ -73,7 +67,7 @@ class GpuGemv(BlasOp, Gemv):
        if self.inplace:
            code = """
                   if (%(y)s->ga.strides[0] <= 0) {
-                     %(out)s = gpublas_try_copy(%(out)s, %(y)s);
+                     %(out)s = theano_try_copy(%(out)s, %(y)s);
                     if (%(out)s == NULL) {
                       %(fail)s
                     }
@@ -85,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
                   """ % vars
        else:
            code = """
-                   %(out)s = gpublas_try_copy(%(out)s, %(y)s);
+                   %(out)s = theano_try_copy(%(out)s, %(y)s);
                   if (%(out)s == NULL) {
                       %(fail)s
                   }
@@ -106,21 +100,33 @@ class GpuGemv(BlasOp, Gemv):
        return code

    def c_code_cache_version(self):
-        return (3,)
+        return (4,)

 gpugemv_no_inplace = GpuGemv(inplace=False)
 gpugemv_inplace = GpuGemv(inplace=True)


-class GpuGemm(BlasOp, Gemm):
+class GpuGemm(BlasOp):
+    __props__ = ('inplace',)
    _f16_ok = True

+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+
    def make_node(self, C, alpha, A, B, beta):
+        ctx_name = infer_context_name(C, A, B)
+        A = as_gpuarray_variable(A, ctx_name)
+        B = as_gpuarray_variable(B, ctx_name)
+        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)
-        A = as_gpuarray_variable(A)
-        B = as_gpuarray_variable(B)
-        C = as_gpuarray_variable(C)
+        assert alpha.ndim == 0
+        assert beta.ndim == 0
+        assert A.ndim == 2
+        assert B.ndim == 2
+        assert C.ndim == 2
        assert A.dtype == B.dtype == C.dtype
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

@@ -138,7 +144,7 @@ class GpuGemm(BlasOp, Gemm):
        if self.inplace:
            code = """
                   if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
-                     %(out)s = gpublas_try_copy(%(out)s, %(C)s);
+                     %(out)s = theano_try_copy(%(out)s, %(C)s);
                     if (%(out)s == NULL) {
                       %(fail)s
                     }
@@ -150,7 +156,7 @@ class GpuGemm(BlasOp, Gemm):
                   """ % vars
        else:
            code = """
-                   %(out)s = gpublas_try_copy(%(out)s, %(C)s);
+                   %(out)s = theano_try_copy(%(out)s, %(C)s);
                   if (%(out)s == NULL) {
                       %(fail)s
                   }
@@ -171,25 +177,36 @@ class GpuGemm(BlasOp, Gemm):
        return code

    def c_code_cache_version(self):
-        return (4,)
-
+        return (5,)

 gpugemm_no_inplace = GpuGemm(inplace=False)
 gpugemm_inplace = GpuGemm(inplace=True)


-class GpuGer(BlasOp, Ger):
+class GpuGer(BlasOp):
+    __props__ = ('inplace',)
+
+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+
    def make_node(self, A, alpha, x, y):
-        Ger.make_node(self, A, alpha, x, y)
-        A = as_gpuarray_variable(A)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(A, x, y)
+        A = as_gpuarray_variable(A, ctx_name)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
+        alpha = as_tensor_variable(alpha)
+        assert alpha.ndim == 0
+        assert A.ndim == 2
+        assert x.ndim == 1
+        assert y.ndim == 1
        assert A.dtype == x.dtype == y.dtype
        return Apply(self, [A, alpha, x, y], [A.type()])

    def perform(self, node, inp, out):
        A, alpha, x, y = inp
-        inplace = self.destructive
+        inplace = self.inplace
        if inplace and not A.flags.forc:
            inplace = False
        out[0][0] = blas.ger(alpha, x, y, A,
@@ -198,10 +215,10 @@ class GpuGer(BlasOp, Ger):
    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
                    fail=sub['fail'], name=name)
-        if self.destructive:
+        if self.inplace:
            code = """
                   if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) {
-                     %(out)s = gpublas_try_copy(%(out)s, %(A)s);
+                     %(out)s = theano_try_copy(%(out)s, %(A)s);
                     if (%(out)s == NULL) {
                       %(fail)s
                     }
@@ -213,7 +230,7 @@ class GpuGer(BlasOp, Ger):
                   """ % vars
        else:
            code = """
-                   %(out)s = gpublas_try_copy(%(out)s, %(A)s);
+                   %(out)s = theano_try_copy(%(out)s, %(A)s);
                   if (%(out)s == NULL) {
                       %(fail)s
                   }
@@ -231,18 +248,22 @@ class GpuGer(BlasOp, Ger):
        return code

    def c_code_cache_version(self):
-        return (2,)
+        return (3,)


-gpuger_no_inplace = GpuGer(destructive=False)
-gpuger_inplace = GpuGer(destructive=True)
+gpuger_no_inplace = GpuGer(inplace=False)
+gpuger_inplace = GpuGer(inplace=True)


-class GpuDot22(BlasOp, Dot22):
+class GpuDot22(BlasOp):
+    __props__ = ()
+
    def make_node(self, x, y):
-        Dot22.make_node(self, x, y)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(x, y)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
+        assert x.ndim == 2
+        assert y.ndim == 2
        assert x.dtype == y.dtype
        return Apply(self, [x, y], [x.type()])

@@ -268,7 +289,7 @@ class GpuDot22(BlasOp, Dot22):
        dims[1] = PyGpuArray_DIMS(%(B)s)[1];

        if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
-                              pygpu_default_context())) {
+                               %(A)s->context)) {
            %(fail)s
        }

@@ -287,32 +308,24 @@ class GpuDot22(BlasOp, Dot22):
        return code

    def c_code_cache_version(self):
-        return (3,)
+        return (4,)

 gpu_dot22 = GpuDot22()


-@local_optimizer([gpugemv_no_inplace], inplace=True)
-def local_inplace_gpuagemv(node):
-    if node.op == gpugemv_no_inplace:
-        return [gpugemv_inplace(*node.inputs)]
+@inplace_allocempty(GpuGemv, 0)
+def local_inplace_gpuagemv(node, inputs):
+    return [gpugemv_inplace(*inputs)]


-@local_optimizer([gpugemm_no_inplace], inplace=True)
-def local_inplace_gpuagemm(node):
-    if node.op == gpugemm_no_inplace:
-        inputs = list(node.inputs)
-        C = inputs[0]
-        if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
-                len(C.clients) > 1):
-            inputs[0] = C.owner.op(*C.owner.inputs)
+@inplace_allocempty(GpuGemm, 0)
+def local_inplace_gpuagemm(node, inputs):
    return [gpugemm_inplace(*inputs)]


-@local_optimizer([gpuger_no_inplace], inplace=True)
-def local_inplace_gpuager(node):
-    if node.op == gpuger_no_inplace:
-        return [gpuger_inplace(*node.inputs)]
+@inplace_allocempty(GpuGer, 0)
+def local_inplace_gpuager(node, inputs):
+    return [gpuger_inplace(*inputs)]

 gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv,
                                            local_inplace_gpuagemm,

--- a/theano/sandbox/gpuarray/conv.cu
+++ b/theano/sandbox/gpuarray/conv.cu
@@ -134,7 +134,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
    const int out_size_byte = out_size*sizeof(float);
    if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
     PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
-                   " %d kernel columns, but the kernel we received had %llud columns!",
+                   " %d kernel columns, but the kernel we received had %llu columns!",
                   THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
      return -1;
    }
@@ -217,13 +217,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i,"
-                      " n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_patch' failed (%s),"
@@ -307,21 +300,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,

        if (err == GA_NO_ERROR)
        {
-            if (verbose>1)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i,"
-                      " kern_flipped=true, accumulate=false, kern_width=%i,"
-                      " img_c_contiguous_2d=%i,"
-                      " kern_c_contiguous_2d=%i, nb_split=%i,"
-                      " preload_full_kernel=%i,"
-                      " subsample_rows=%llu, subsample_cols=%llu\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1],
-                      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
-                      nb_split, preload_full_kernel,
-                      (unsigned long long)subsample_rows,
-                      (unsigned long long)subsample_cols);
            if (verbose)
              fprintf(stderr,
                      "INFO: used 'conv_patch_stack' version with nb_split=%i"
@@ -334,21 +312,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i,"
-                      " kern_flipped=true, accumulate=false,"
-                      " kern_width=%i, img_c_contiguous_2d=%i,"
-                      " kern_c_contiguous_2d=%i, nb_split=%i,"
-                      " preload_full_kernel=%i,"
-                      " subsample_rows=%llu, subsample_cols=%llu\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1],
-                      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
-                      nb_split, preload_full_kernel,
-                      (unsigned long long)subsample_rows,
-                      (unsigned long long)subsample_cols);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_patch_stack' failed (%s),"
@@ -394,12 +357,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_rows' failed (%s),"
@@ -428,19 +385,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,

        size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);

-        if (0)
-          fprintf(stderr,
-                  "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
-                  img_contiguous_2d, kern_contiguous_2d,
-                  threads_per_block[0], threads_per_block[1], threads_per_block[2],
-                  n_blocks[0], n_blocks[1], n_blocks[2]);
-
        GpuKernel *k = NULL;
        if(!img_contiguous_2d || !kern_contiguous_2d) {
-            //fprintf(stderr, "using false version\n");
            k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
        } else {
-            //fprintf(stderr, "using true version\n");
            k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
        }

@@ -460,23 +408,11 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        if (err == GA_NO_ERROR)
        {
            work_complete = true;
-            if (verbose>1)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_rows_stack' failed (%s),"
@@ -543,12 +479,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        if (err == GA_NO_ERROR)
        {
            work_complete = true;
-            if (verbose>1)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: used 'conv_rows_stack2' version %s with"
@@ -558,12 +488,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i version=%d\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_rows_stack2' failed (%s),"
@@ -680,13 +604,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,

            if (err == GA_NO_ERROR)
            {
-                if (verbose>1)
-                    fprintf(stderr,
-                            "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
-                            "n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
-                            " nb_threads=%i\n",
-                            threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
-                            shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
                if (verbose)
                    fprintf(stderr,
                            "INFO: used 'conv_patch_stack_reduce' version"
@@ -697,14 +614,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
            }
            else
            {
-                if (verbose)
-                  fprintf(stderr,
-                          "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
-                          " n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
-                          " nb_threads=%i\n",
-                          threads_per_block[0], threads_per_block[1], threads_per_block[2],
-                          n_blocks[0], n_blocks[1], shmem_sz,
-                          threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
                if (verbose)
                  fprintf(stderr,
                          "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
@@ -714,7 +623,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        } // else no good nb_splits was found
    }

-    if (1 && (version==6||version==-1) &&
+    if ((version==6||version==-1) &&
        kern_len<=320 &&
        !work_complete) //conv_valid_row_reduce
    {
@@ -782,12 +691,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0],
-                      n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_valid_row_reduce' failed (%s),"
@@ -805,43 +708,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
                                        (size_t)256),
                               (size_t)1, (size_t)1};

-        if (1)
-        {
        if (verbose)
            fprintf(stderr, "INFO: launching conv_reference_valid\n");
-            if (verbose>1)
-              fprintf(stderr, "      img : %i %llu %i %i %p  "
-                      "%lld %lld %lld %lld\n",
-                      nbatch, (unsigned long long)stack_len, img_len, img_wid,
-                      (void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
-                      (long long)img_stride_batch,
-                      (long long)img_stride_stack,
-                      (long long)img_stride_row,
-                      (long long)img_stride_col);
-            if (verbose>1)
-              fprintf(stderr, "      kern: %i %i %i %i %p  "
-                      "%lld %lld %lld %lld\n",
-                      nkern, nstack, kern_len, kern_wid,
-                      (void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
-                      (long long)kern_stride_nkern,
-                      (long long)kern_stride_stack,
-                      (long long)kern_stride_row,
-                      (long long)kern_stride_col);
-            if (verbose>1)
-                fprintf(stderr, "      out : %llu %llu %i %i %p  "
-                        "%lld %lld %lld %lld\n",
-                      (unsigned long long)PyGpuArray_DIMS(out)[0],
-                      (unsigned long long)PyGpuArray_DIMS(out)[1],
-                      out_len, out_wid,
-                      (void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
-                      (long long)out_stride_batch,
-                      (long long)out_stride_nkern,
-                      (long long)out_stride_row,
-                      (long long)out_stride_col);
-            if (verbose>1)
-              fprintf(stderr, "   launch params: %i %i %i\n",
-                      outsize, n_blocks[0], threads_per_block[0]);
-        }

        void *kernel_params[] = {
            (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
@@ -1113,15 +981,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,

        if (err == GA_NO_ERROR)
        {
-          if (verbose>1)
-            fprintf(stderr,
-                    "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
-                    " n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
-                    " out_len=%i, nb_split=%i, version=%i\n",
-                    threads_per_block[0], threads_per_block[1], threads_per_block[2],
-                    n_blocks[0], n_blocks[1], shmem_sz,
-                    threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
-                    out_len, nb_split, version);
            if (verbose)
              fprintf(stderr,
                      "INFO: used 'conv_full_patch_stack_padded'"
@@ -1131,15 +990,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-          if (verbose)
-            fprintf(stderr,
-                    "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
-                    " n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
-                    " out_len=%i, nb_split=%i, version=%i\n",
-                    threads_per_block[0], threads_per_block[1], threads_per_block[2],
-                    n_blocks[0], n_blocks[1], shmem_sz,
-                    threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
-                    out_len, nb_split, version);
          if (verbose)
            fprintf(stderr,
                    "INFO: impl 'conv_full_patch_stack_padded' %s %s"
@@ -1179,12 +1029,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
-                      threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr,
                      "INFO: impl 'conv_full_patch' failed (%s),"
@@ -1225,12 +1069,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
-                      threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
                      " failed (%s), trying next implementation\n",
@@ -1276,12 +1114,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-            if (verbose)
-              fprintf(stderr,
-                      "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
-                      " shmem_sz=%i, nb_threads=%i\n",
-                      threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
-                      shmem_sz, threads_per_block[0] * threads_per_block[1]);
            if (verbose)
              fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
                      GpuKernel_error(k, err));
@@ -1298,55 +1130,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
                                        (size_t)256),
                               (size_t)1, (size_t)1};

-        if (0)
-        {
-            if (verbose)
-              fprintf(stderr, "INFO: launching conv_reference_valid\n");
-            if (verbose)
-              fprintf(stderr, "      img : %llu %llu %llu %llu %p  "
-                      "%lld %lld %lld %lld\n",
-                      (unsigned long long)nbatch,
-                      (unsigned long long)stack_len,
-                      (unsigned long long)img_len,
-                      (unsigned long long)img_wid,
-                      (void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
-                      (long long)img_stride_batch,
-                      (long long)img_stride_stack,
-                      (long long)img_stride_row,
-                      (long long)img_stride_col);
-            if (verbose)
-              fprintf(stderr, "      kern: %llu %llu %llu %llu %p  "
-                      "%lld %lld %lld %lld\n",
-                      (unsigned long long)nkern,
-                      (unsigned long long)nstack,
-                      (unsigned long long)kern_len,
-                      (unsigned long long)kern_wid,
-                      (void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
-                      (long long)kern_stride_nkern,
-                      (long long)kern_stride_stack,
-                      (long long)kern_stride_row,
-                      (long long)kern_stride_col);
-            if (verbose)
-                fprintf(stderr, "      out : %llu %llu %llu %llu %p  "
-                        "%lld %lld %lld %lld\n",
-                      (unsigned long long)PyGpuArray_DIMS(out)[0],
-                      (unsigned long long)PyGpuArray_DIMS(out)[1],
-                      (unsigned long long)out_len,
-                      (unsigned long long)out_wid,
-                      (void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
-                      (long long)out_stride_batch,
-                      (long long)out_stride_nkern,
-                      (long long)out_stride_row,
-                      (long long)out_stride_col);
-            if (verbose)
-              fprintf(stderr, "   launch params: %i %i %i\n",
-                      outsize, n_blocks[0], threads_per_block[0]);
-            if (verbose)
-                fprintf(stderr, "   subsample params: %llu %llu\n",
-                        (unsigned long long)subsample_rows,
-                        (unsigned long long)subsample_cols);
-        }
-
        void *kernel_params[] = {
            (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
            (void *)&img_len, (void *)&img_wid,
@@ -1377,11 +1160,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        }
        else
        {
-          if (verbose)
-            fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
-                    " n_blocks[0]=%i, n_blocks[1]=%i,"
-                    " shmem_sz=%i, nb_threads=%i\n",
-                    threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
          if (verbose)
            fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
                    " trying next implementation\n",
@@ -1465,7 +1243,7 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,

      rval = pygpu_zeros(4, out_dim,
                         img->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context(), Py_None);
+                         img->context, Py_None);
      //rval might be null
    }
    if ((rval==NULL)
@@ -1488,14 +1266,3 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
    }
    return (PyObject*)rval;
 }
-
-/*
-  Local Variables:
-  mode:c++
-  c-basic-offset:4
-  c-file-style:"stroustrup"
-  indent-tabs-mode:nil
-  fill-column:79
-  End:
-*/
-// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
--- a/theano/sandbox/gpuarray/conv.py
+++ b/theano/sandbox/gpuarray/conv.py
 import copy
 import os

-import theano
-from theano import config, gof
+from theano import gof

 try:
    from pygpu import gpuarray
@@ -10,7 +9,8 @@ except ImportError:
    pass

 from .type import GpuArrayType
-from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
+from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
+                        infer_context_name)
 from theano.gof import utils


@@ -58,6 +58,9 @@ class GpuConv(GpuKernelBase, gof.Op):
        them.

    """
+    __props__ = ('border_mode', 'subsample', 'logical_img_hw',
+                 'logical_kern_hw', 'logical_kern_align_top', 'version',
+                 'verbose', 'kshp', 'imshp', 'max_threads_dim0')

    @staticmethod
    def logical_output_shape_2d(imshp, kshp, mode):
@@ -67,20 +70,13 @@ class GpuConv(GpuKernelBase, gof.Op):
            return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
        raise ValueError(mode)

-    def __init__(self, border_mode,
-                 subsample=(1, 1),
-                 logical_img_hw=None,
-                 logical_kern_hw=None,
+    def __init__(self, border_mode, subsample=(1, 1),
+                 logical_img_hw=None, logical_kern_hw=None,
                 logical_kern_align_top=True,
-                 version=-1,
-                 direction_hint=None,
-                 verbose=0,
-                 kshp=None,
-                 imshp=None,
+                 version=-1, direction_hint=None,
+                 verbose=0, kshp=None, imshp=None,
                 max_threads_dim0=None,
-                 nkern=None,
-                 bsize=None,
-                 fft_opt=True):
+                 nkern=None, bsize=None, fft_opt=True):
        self.border_mode = border_mode
        self.subsample = subsample
        if logical_img_hw is not None:
@@ -108,19 +104,6 @@ class GpuConv(GpuKernelBase, gof.Op):
        self.bsize = bsize
        self.fft_opt = fft_opt

-    def __eq__(self, other):
-        return type(self) == type(other) \
-            and self.border_mode == other.border_mode \
-            and self.subsample == other.subsample \
-            and self.logical_img_hw == other.logical_img_hw \
-            and self.logical_kern_hw == other.logical_kern_hw \
-            and self.logical_kern_align_top == other.logical_kern_align_top \
-            and self.version == other.version \
-            and self.verbose == other.verbose \
-            and self.kshp == other.kshp\
-            and self.imshp == other.imshp\
-            and self.max_threads_dim0 == other.max_threads_dim0
-
    def __setstate__(self, d):
        self.__dict__.update(d)
        if not hasattr(self, "imshp"):
@@ -136,32 +119,6 @@ class GpuConv(GpuKernelBase, gof.Op):
        if not hasattr(self, "fft_opt"):
            self.fft_opt = True

-    def __hash__(self):
-        # don't use hash(self.version) as hash(-1)==-2 and
-        # hash(-2)==-2 in python!
-        return hash(type(self)) \
-            ^ hash(self.border_mode) \
-            ^ hash(self.subsample) \
-            ^ hash(self.logical_img_hw) \
-            ^ hash(self.logical_kern_hw) \
-            ^ hash(self.logical_kern_align_top) \
-            ^ self.version \
-            ^ hash(self.verbose) \
-            ^ hash(self.kshp)\
-            ^ hash(self.imshp)\
-            ^ hash(self.max_threads_dim0)
-
-    def __str__(self):
-        return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
-            self.__class__.__name__,
-            self.border_mode,
-            str(self.subsample),
-            str(self.logical_img_hw),
-            str(self.logical_kern_hw),
-            str(self.logical_kern_align_top),
-            str(self.imshp),
-            str(self.kshp))
-
    def make_node(self, img, kern):
        if img.dtype != "float32" or kern.dtype != "float32":
            raise NotImplementedError("GpuConv currently only work"
@@ -170,13 +127,17 @@ class GpuConv(GpuKernelBase, gof.Op):
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
-        img = as_gpuarray_variable(img)
-        kern = as_gpuarray_variable(kern)
+        ctx_name = infer_context_name(img, kern)
+        img = as_gpuarray_variable(img, ctx_name)
+        kern = as_gpuarray_variable(kern, ctx_name)
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
-        out = GpuArrayType(img.dtype, broadcastable)()
+        out = GpuArrayType(img.dtype, broadcastable, context_name=ctx_name)()
        return gof.Apply(self, [img, kern], [out])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def flops(self, inputs, outputs):
        """
        Useful with the hack in profilemode to print the MFlops.
@@ -202,22 +163,8 @@ class GpuConv(GpuKernelBase, gof.Op):
    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        node_ = copy.copy(node)
        assert node.op is node_.op
-        if config.gpuarray.sync:
-            raise NotImplementedError("GpuConv do not implement gpuarray.sync Theano flag")
        if node_.op.max_threads_dim0 is None:
-            cuda = theano.sandbox.cuda
-            device_id = cuda.use.device_number
-            if device_id is None:
-                cuda.use("gpu",
-                         force=False,
-                         default_to_move_computation_to_gpu=False,
-                         move_shared_float32_to_gpu=False,
-                         enable_cuda=False,
-                         test_driver=True)
-                device_id = cuda.use.device_number
-            cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
-            prop = cuda_ndarray.device_properties(device_id)
-            node_.op.max_threads_dim0 = prop['maxThreadsDim0']
+            node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
        return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                                   compute_map, no_recycling)

@@ -232,9 +179,11 @@ class GpuConv(GpuKernelBase, gof.Op):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 22)
+        return (0, 23)

    def c_code(self, node, nodename, inp, out_, sub):
+        if node.inputs[0].type.context.kind != "cuda":
+            raise NotImplementedError("GpuConv only works for cuda devices")
        img, kern = inp
        out, = out_
        dx = self.subsample[0]
@@ -302,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
        """ % locals()
        code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
                           for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
-        kname = "conv_full_load_everything"
        gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
        bin = gk._binary
        bcode = ','.join(hex(ord(c)) for c in bin)
@@ -313,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
        static const char conv_bcode[] = {%(bcode)s};
        static const char *conv_code = "%(code)s";
        """ % locals()
-        for k in kernels:
-            mod += "static GpuKernel " + k.name + '_' + name + ";\n"
-        mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
+        return mod
+
+    def c_support_code_struct(self, node, name):
+        mod = GpuKernelBase.c_support_code_struct(self, node, name)
+        with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
+            mod += f.read()
        return mod

    @utils.memoize

--- a/theano/sandbox/gpuarray/conv_kernel.cu
+++ b/theano/sandbox/gpuarray/conv_kernel.cu
@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {

 //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
 //bit, it is 32 bit.
-const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
+const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers

 __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
  if (nb_thread < 64)
@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
      if (thread_id < nb_thread)
        {
          const float * my_src_ptr = (const float *)(
-                  ((uintptr_t)src) & COALESCED_ALIGN);
+                  ((size_t)src) & COALESCED_ALIGN);
          my_src_ptr += thread_id;
          while (my_src_ptr < src + N)
          {

--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -15,8 +15,9 @@ from theano.tensor.nnet import SoftmaxGrad
 from theano.tensor.signal.downsample import (
    DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)

-from . import pygpu, init_dev
-from .basic_ops import (as_gpuarray_variable,
+from . import pygpu
+from .type import get_context, gpu_context_type, list_contexts
+from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        gpu_contiguous, HostFromGpu,
                        GpuAllocEmpty, empty_like)
 from .elemwise import GpuElemwise
@@ -29,28 +30,14 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
 from .opt_util import alpha_merge, output_merge, inplace_allocempty


-def dnn_available():
-    if dnn_available.avail is not None:
-        return dnn_available.avail
-    if pygpu is None:
-        dnn_available.msg = "PyGPU not available"
-        dnn_available.avail = False
-        return False
-    if not init_dev.device.startswith('cuda'):
-        dnn_available.msg = "Not on a CUDA device. Got %s." % init_dev.device
-        dnn_available.avail = False
-        return False
-    # This is a hack because bin_id is in the from of
-    # "sm_<major><minor>" for cuda devices.
-    if pygpu.get_default_context().bin_id[:-2] < '30':
-        dnn_available.msg = "Device not supported by cuDNN"
-        dnn_available.avail = False
+def _dnn_check_compile():
    preambule = """
 #include <stdio.h>
 #include <cudnn.h>
 #include <cudnn_helper.h>
 """

+    # No need for the context in here since we won't execute that code
    body = """
 cudnnHandle_t _handle = NULL;
 cudnnStatus_t err;
@@ -70,35 +57,71 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
    # default gpu, not the one selected by the user. If mixed
    # GPU are installed or if the GPUs are configured in
    # exclusive mode, this cause bad detection.
-    comp, out, err = GCC_compiler.try_flags(
+    avail, out, err = GCC_compiler.try_flags(
        params, preambule=preambule, body=body,
        try_run=False, output=True)

-    dnn_available.avail = comp
-    if not dnn_available.avail:
-        dnn_available.msg = (
-            "Theano cannot compile with cuDNN. We got this error:\n" +
-            str(err))
-    else:
-        # If we can compile, check that we can import and run.
+    if not avail:
+        return False, ("Theano cannot compile with cuDNN. "
+                       "We got this error:\n" + str(err))
+    return True, None
+
+
+def _dnn_check_version():
    v = version()
    if v < 2000:
-            dnn_available.avail = False
-            dnn_available.msg = (
+        return False, (
            "You have an old release of CuDNN (or a release candidate) "
            "that isn't supported.  Please update to at least v2 final "
            "version.")
-            raise RuntimeError(dnn_available.msg)
    if v >= 3000 and v < 3007:
-            dnn_available.avail = False
-            dnn_available.msg = (
+        return False, (
            "You have installed a release candidate of CuDNN v3. This "
            "isn't supported. Please update to v3 final version.")
-            raise RuntimeError(dnn_available.msg)

-    return dnn_available.avail
+    return True, None
+
+
+def dnn_present():
+    if dnn_present.avail is not None:
+        return dnn_present.avail
+
+    if pygpu is None:
+        dnn_present.msg = "PyGPU not available"
+        dnn_present.avail = False
+        return False
+
+    dnn_present.avail, dnn_present.msg = _dnn_check_compile()
+    if dnn_present.avail:
+        dnn_present.avail, dnn_present.msg = _dnn_check_version()
+        if not dnn_present.avail:
+            raise RuntimeError(dnn_present.msg)
+
+    return dnn_present.avail
+
+dnn_present.avail = None
+dnn_present.msg = None
+
+
+def dnn_available(context_name):
+    if not dnn_present():
+        dnn_available.msg = dnn_present.msg
+        return False
+
+    ctx = get_context(context_name)
+
+    if not ctx.kind == 'cuda':
+        dnn_available.msg = "Not on a CUDA device."
+        return False
+
+    # This is a hack because bin_id is in the from of
+    # "<something>_<major><minor>" for cuda devices.
+    if ctx.bin_id[:-2] < '30':
+        dnn_available.msg = "Device not supported by cuDNN"
+        return False
+
+    return True

-dnn_available.avail = None
 dnn_available.msg = None


@@ -110,6 +133,10 @@ class DnnBase(COp):
    # dnn does not know about broadcasting, so we do not need to assert
    # the input broadcasting pattern.
    check_broadcast = False
+    context_type = gpu_context_type
+
+    def get_context(self, node):
+        return node.outputs[0].type.context

    def __init__(self, files=None, c_func=None):
        if files is None:
@@ -181,7 +208,7 @@ def version():

    This also does a check that the header version matches the runtime version.
    """
-    if not dnn_available():
+    if not dnn_present():
        raise Exception(
            "We can't determine the cudnn version as it is not available",
            dnn_available.msg)
@@ -390,9 +417,10 @@ class GpuDnnConv(DnnBase):
        return defs

    def make_node(self, img, kern, output, desc, alpha=None, beta=None):
-        img = as_gpuarray_variable(img)
-        kern = as_gpuarray_variable(kern)
-        output = as_gpuarray_variable(output)
+        ctx_name = infer_context_name(img, kern, output)
+        img = as_gpuarray_variable(img, ctx_name)
+        kern = as_gpuarray_variable(kern, ctx_name)
+        output = as_gpuarray_variable(output, ctx_name)
        if img.type.ndim not in (4, 5):
            raise TypeError('img must be 4D or 5D tensor')
        if kern.type.ndim not in (4, 5):
@@ -574,9 +602,10 @@ class GpuDnnConvGradW(DnnBase):
        return defs

    def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
-        img = as_gpuarray_variable(img)
-        topgrad = as_gpuarray_variable(topgrad)
-        output = as_gpuarray_variable(output)
+        ctx_name = infer_context_name(img, topgrad, output)
+        img = as_gpuarray_variable(img, ctx_name)
+        topgrad = as_gpuarray_variable(topgrad, ctx_name)
+        output = as_gpuarray_variable(output, ctx_name)
        if img.type.ndim not in (4, 5):
            raise TypeError('img must be 4D or 5D tensor')
        if topgrad.type.ndim not in (4, 5):
@@ -689,9 +718,10 @@ class GpuDnnConvGradI(DnnBase):
        return defs

    def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
-        kern = as_gpuarray_variable(kern)
-        topgrad = as_gpuarray_variable(topgrad)
-        output = as_gpuarray_variable(output)
+        ctx_name = infer_context_name(kern, topgrad, output)
+        kern = as_gpuarray_variable(kern, ctx_name)
+        topgrad = as_gpuarray_variable(topgrad, ctx_name)
+        output = as_gpuarray_variable(output, ctx_name)
        if kern.type.ndim not in (4, 5):
            raise TypeError('kern must be 4D or 5D tensor')
        if topgrad.type.ndim not in (4, 5):
@@ -770,6 +800,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
        warnings.warn("workmem is deprecated, use algo instead", stacklevel=2)
        algo = workmem
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
+    ctx_name = infer_context_name(img, kerns)
    if (border_mode == 'valid' and subsample == (1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
@@ -782,12 +813,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
-        out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph),
+        out = GpuAllocEmpty(img.dtype, ctx_name)(
+            shape_i(kerns, 1, fgraph),
            shape_i(img, 1, fgraph), shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode='cross')(out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
-        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3))
+        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)

    elif (border_mode == 'full' and subsample == (1, 1) and
          direction_hint != 'forward!'):
@@ -799,7 +831,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
-        out = GpuAllocEmpty(img.dtype)(shape_i(img, 0, fgraph),
+        out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
                                                 shape_i(kerns, 1, fgraph),
                                                 shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
@@ -817,7 +849,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
    out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
                                       desc_op.border_mode,
                                       desc_op.subsample)
-    out = GpuAllocEmpty(img.dtype)(*out_shp)
+    out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp)
    return GpuDnnConv(algo=algo)(img, kerns, out, desc)


@@ -948,7 +980,7 @@ class GpuDnnPool(DnnBase):
        DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")

    def make_node(self, img, desc):
-        img = as_gpuarray_variable(img)
+        img = as_gpuarray_variable(img, infer_context_name(img))

        if desc.owner is not None:
            e_ndim = desc.owner.op.get_ndim() + 2
@@ -1002,7 +1034,7 @@ class GpuDnnPoolGrad(DnnBase):
        The input of the pooling.
    out
        The output of the pooling in the forward.
-    inp_grad
+    out_grad
        Same size as out, but is the corresponding gradient information.
    desc
        The pooling descriptor.
@@ -1016,9 +1048,10 @@ class GpuDnnPoolGrad(DnnBase):
                         "APPLY_SPECIFIC(dnn_pool_grad)")

    def make_node(self, inp, out, out_grad, desc):
-        inp = as_gpuarray_variable(inp)
-        out_grad = as_gpuarray_variable(out_grad)
-        out = as_gpuarray_variable(out)
+        ctx_name = infer_context_name(inp, out, out_grad)
+        inp = as_gpuarray_variable(inp, ctx_name)
+        out_grad = as_gpuarray_variable(out_grad, ctx_name)
+        out = as_gpuarray_variable(out, ctx_name)

        if desc.owner is not None:
            nd = desc.owner.op.get_ndim() + 2
@@ -1147,7 +1180,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
    c_func = "APPLY_SPECIFIC(softmax)"

    def make_node(self, x):
-        x = as_gpuarray_variable(x)
+        x = as_gpuarray_variable(x, infer_context_name(x))
        assert x.ndim == 4
        return Apply(self, [x], [x.type()])

@@ -1181,8 +1214,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
    c_func = "APPLY_SPECIFIC(softmax_grad)"

    def make_node(self, dy, sm):
-        dy = as_gpuarray_variable(dy)
-        sm = as_gpuarray_variable(sm)
+        ctx_name = infer_context_name(dy, sm)
+        dy = as_gpuarray_variable(dy, ctx_name)
+        sm = as_gpuarray_variable(sm, ctx_name)
        assert dy.ndim == 4
        assert sm.ndim == 4
        return Apply(self, [dy, sm], [sm.type()])
@@ -1191,9 +1225,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
 # @register_opt('cudnn')  # this optimizer is registered in opt.py instead.
 @local_optimizer([GpuConv])
 def local_conv_dnn(node):
-    if not dnn_available():
-        return
    if isinstance(node.op, GpuConv):
+        if not dnn_available(node.outputs[0].type.context_name):
+            return
        if node.op.border_mode not in ['full', 'valid']:
            return
        img, kern = node.inputs
@@ -1211,9 +1245,9 @@ def local_conv_dnn(node):
 # because for some input/kernel shape configurations, this is faster.
 @local_optimizer([GpuConv])
 def local_conv_dnn_alternative(node):
-    if not dnn_available():
-        return
    if isinstance(node.op, GpuConv):
+        if not dnn_available(node.outputs[0].type.context_name):
+            return
        border_mode = node.op.border_mode
        subsample = node.op.subsample
        if border_mode not in ['full', 'valid'] or subsample != (1, 1):
@@ -1304,8 +1338,8 @@ def local_dnn_convi_output_merge(node, *inputs):

 @register_opt('cudnn')
 @op_lifter([DownsampleFactorMax])
-def local_pool_dnn_alternative(node):
-    if not dnn_available():
+def local_pool_dnn_alternative(node, ctx_name):
+    if not dnn_available(ctx_name):
        return
    if not node.op.ignore_border:
        return
@@ -1320,8 +1354,8 @@ def local_pool_dnn_alternative(node):

 @register_opt('cudnn')
 @op_lifter([MaxPoolGrad])
-def local_pool_dnn_grad_stride(node):
-    if not dnn_available():
+def local_pool_dnn_grad_stride(node, ctx_name):
+    if not dnn_available(ctx_name):
        return
    if not node.op.ignore_border:
        return
@@ -1340,8 +1374,8 @@ def local_pool_dnn_grad_stride(node):

 @register_opt('cudnn')
 @op_lifter([AveragePoolGrad])
-def local_avg_pool_dnn_grad_stride(node):
-    if not dnn_available():
+def local_avg_pool_dnn_grad_stride(node, ctx_name):
+    if not dnn_available(ctx_name):
        return
    if not node.op.ignore_border:
        return
@@ -1363,22 +1397,23 @@ def local_avg_pool_dnn_grad_stride(node):
 @register_opt('cudnn')
 @local_optimizer([GpuSoftmax])
 def local_softmax_dnn(node):
-    if not dnn_available():
-        return
    if isinstance(node.op, GpuSoftmax):
+        if not dnn_available(node.outputs[0].type.context_name):
+            return
        ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
        ins = gpu_contiguous(ins)
        out = GpuDnnSoftmax('accurate', 'channel')(ins)
-        out = as_gpuarray_variable(out.dimshuffle(0, 1))
+        out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
        return [out]


 @register_opt('cudnn')
 @local_optimizer([GpuElemwise])
 def local_log_softmax_dnn(node):
-    if not dnn_available() or version() < 3000:
+    if version() < 3000:
        # No log-softmax before cudnn v3
        return
+    # This looks for GpuDnnSoftmax so we know that we have cudnn.
    if (isinstance(node.op, GpuElemwise) and
            isinstance(node.op.scalar_op, Log) and
            node.inputs[0].owner and
@@ -1392,15 +1427,16 @@ def local_log_softmax_dnn(node):
 class NoCuDNNRaise(Optimizer):
    def apply(self, fgraph):
        """
-        Raise a RuntimeError if cudnn can't be used.
+        Raise a error if cudnn can't be used.

        """
-        if not dnn_available():
+        for c in list_contexts():
+            if not dnn_available(c):
                # Make an assert error as we want Theano to fail, not
                # just skip this optimization.
                raise AssertionError(
-                "cuDNN optimization was enabled, but Theano was not able"
-                " to use it. We got this error: \n" +
+                    "cuDNN optimization was enabled, but Theano was not able "
+                    "to use it for context " + c + ". We got this error: \n" +
                    dnn_available.msg)

 gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@@ -1408,8 +1444,8 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')

 @register_opt('cudnn')
 @op_lifter([SoftmaxGrad])
-def local_softmax_dnn_grad(node):
-    if not dnn_available():
+def local_softmax_dnn_grad(node, ctx_name):
+    if not dnn_available(ctx_name):
        return
    ins = []
    for n in node.inputs:

--- a/theano/sandbox/gpuarray/dnn_base.c
+++ b/theano/sandbox/gpuarray/dnn_base.c
@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle);
 #section init_code_struct

 {
-  cuda_enter(pygpu_default_context()->ctx);
+  cuda_enter(CONTEXT->ctx);
  cudnnStatus_t err;
  APPLY_SPECIFIC(_handle) = NULL;
  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                 cudnnGetErrorString(err));
-    cuda_exit(pygpu_default_context()->ctx);
+    cuda_exit(CONTEXT->ctx);
    FAIL;
  }
-  cuda_exit(pygpu_default_context()->ctx);
+  cuda_exit(CONTEXT->ctx);
 }
--- a/theano/sandbox/gpuarray/dnn_fwd.c
+++ b/theano/sandbox/gpuarray/dnn_fwd.c
@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                         PyGpuArrayObject *om,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta,
-                         PyGpuArrayObject **output) {
+                         PyGpuArrayObject **output,
+                         PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,

--- a/theano/sandbox/gpuarray/dnn_gi.c
+++ b/theano/sandbox/gpuarray/dnn_gi.c
@@ -4,12 +4,12 @@ int
 APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                        PyGpuArrayObject *im,
                        cudnnConvolutionDescriptor_t desc,
-                        double alpha, double beta, PyGpuArrayObject **input) {
+                        double alpha, double beta, PyGpuArrayObject **input,
+                        PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "

--- a/theano/sandbox/gpuarray/dnn_gw.c
+++ b/theano/sandbox/gpuarray/dnn_gw.c
@@ -4,12 +4,12 @@ int
 APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
-                        double alpha, double beta, PyGpuArrayObject **kerns) {
+                        double alpha, double beta, PyGpuArrayObject **kerns,
+                        PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI

 int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             cudnnPoolingDescriptor_t desc,
-                             PyGpuArrayObject **out) {
+                             PyGpuArrayObject **out,
+                             PyGpuContextObject *c) {
  cudnnStatus_t err;
  size_t dims[5];
-  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");

--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
                                  cudnnPoolingDescriptor_t desc,
-                                  PyGpuArrayObject **inp_grad) {
+                                  PyGpuArrayObject **inp_grad,
+                                  PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,

  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
                         PyGpuArray_DIMS(inp), inp->ga.typecode,
-                         GA_C_ORDER, pygpu_default_context()) != 0) {
+                         GA_C_ORDER, c) != 0) {
    return 1;
  }


--- a/theano/sandbox/gpuarray/dnn_softmax.c
+++ b/theano/sandbox/gpuarray/dnn_softmax.c
@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL)
 #section support_code_struct

 int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
-                            PyGpuArrayObject **out) {
+                            PyGpuArrayObject **out,
+                            PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
    return 1;

--- a/theano/sandbox/gpuarray/dnn_softmax_grad.c
+++ b/theano/sandbox/gpuarray/dnn_softmax_grad.c
@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL)

 int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                 PyGpuArrayObject *sm,
-                                 PyGpuArrayObject **dx) {
+                                 PyGpuArrayObject **dx,
+                                 PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
    return 1;

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -20,8 +20,8 @@ try:
 except ImportError:
    pass

-from .basic_ops import (as_gpuarray_variable, HideC,
-                        GpuKernelBase, Kernel)
+from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
+                        infer_context_name)
 from .type import GpuArrayType
 from .fp16_help import load_w, write_w

@@ -37,7 +37,7 @@ def make_argument(v, name):
        return ArrayArg(numpy.dtype(v.type.dtype), name)


-def ensure_allocated(storage, shape, dtype):
+def ensure_allocated(storage, shape, dtype, ctx):
    odat = storage[0]
    if odat is not None:
        if odat.shape != shape:
@@ -45,7 +45,7 @@ def ensure_allocated(storage, shape, dtype):
            # we have to allocate output storage.
            odat = None
    if odat is None:
-        odat = pygpu.empty(shape, dtype=dtype)
+        odat = pygpu.empty(shape, dtype=dtype, context=ctx)
    storage[0] = odat
    return odat

@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)

    def make_node(self, *inputs):
+        ctx_name = infer_context_name(*inputs)
        res = Elemwise.make_node(self, *inputs)
        outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
+                                context_name=ctx_name,
                                dtype=o.type.dtype)() for o in res.outputs]
        if len(outputs) > 1:
            raise NotImplementedError()
-        inputs = [as_gpuarray_variable(i) for i in inputs]
+        inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
        node = Apply(self, inputs, outputs)

        # Try to generate the kernel to catch SupportCodeErrors
@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):

        return node

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def generate_kernel(self, node, nodename):
        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
                enumerate(node.inputs)]
@@ -168,7 +173,8 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                        ("npy_float64", "ga_double"),
                        ]:
            kop = kop.replace(npy, ga)
-        return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
+        return ElemwiseKernel(self.get_context(node), inps + outs, kop,
+                              preamble=support_code)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']
@@ -177,8 +183,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        return self.scalar_op.c_support_code()

    def _gpu_kernel_code(self, node, nodename):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
        # This is useless by itself, but will serve an eventual c_code
        # implementation
        k = self.generate_kernel(node, nodename)
@@ -191,8 +195,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        return '\n'.join(res)

    def gpu_kernels(self, node, nodename):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
        src = self._gpu_kernel_code(node, nodename)
        nd = node.outputs[0].ndim
        params = ['uintp']
@@ -214,12 +216,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                       objvar='elem_%d_%s' % (nd, nodename))]

    def c_code(self, node, name, inputs, outputs, sub):
-        if pygpu.get_default_context().kind == 'opencl':
+        if node.inputs[0].type.context.kind != 'cuda':
            raise MethodNotDefined('cuda only')
        nd = node.outputs[0].ndim
        fail = sub["fail"]
        initial_dims = ','.join('1' for i in xrange(nd))
        opname = str(self.scalar_op)
+        ctx = sub['context']

        # check that all inputs have valid dimensions
        emitted_inames = {}
@@ -264,11 +267,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
            if iname in emitted_inames:
                continue
            code += """
-        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
-                         "need %(nd)s dims, not %%i",
+                         "need %(nd)s dims, not %%u",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
@@ -279,14 +281,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
-                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
-                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
-                             ", but the output's size on that axis is %%i.",
+                             " %(idx)d (indices start at 0) has shape[%%d] == %%llu"
+                             ", but the output's size on that axis is %%llu.",
                             i,
-                             PyGpuArray_DIMS(%(iname)s)[i],
-                             dims[i]
+                             (unsigned long long)PyGpuArray_DIMS(%(iname)s)[i],
+                             (unsigned long long)dims[i]
                            );
                %(fail)s;
            }
@@ -314,15 +315,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                            %(typecode)s, GA_C_ORDER,
-                            pygpu_default_context(), Py_None);
+                            %(ctx)s, Py_None);
            if (!%(oname)s) {
-                        //TODO, this check don't seam good.
-                        //TODO, set exception?
                %(fail)s
            }
        }
-        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
-        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
@@ -337,19 +334,17 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
-                             " on input %(input_idx)s, has shape[%%i] == %%i"
-                             ", but the output's size on that axis is %%i.",
+                             " on input %(input_idx)s, has shape[%%i] == %%llu"
+                             ", but the output's size on that axis is %%llu.",
                             i,
-                             PyGpuArray_DIMS(%(oname)s)[i],
-                             dims[i]
+                             (unsigned long long)PyGpuArray_DIMS(%(oname)s)[i],
+                             (unsigned long long)dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
-        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
-        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        z = outputs[0]
        code += """numEls = PyGpuArray_SIZE(%(z)s);
@@ -367,7 +362,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);

-                //std::cerr << "calling callkernel returned\\n";
        """ % locals()

        kname = 'elem_%d_%s' % (nd, name)
@@ -407,7 +401,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
            """ % locals()
        return str(code)

-    def perform(self, node, inputs, output_storage):
+    def perform(self, node, inputs, output_storage, ctx):
        # Try to reuse the kernel from a previous call to hopefully
        # avoid recompiling
        if not hasattr(node, '_cache_elemwise_k'):
@@ -428,7 +422,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
            if n in self.inplace_pattern:
                stor[0] = inputs[self.inplace_pattern[n]]
            else:
-                args.append(ensure_allocated(stor, out_shape, out.type.dtype))
+                args.append(ensure_allocated(stor, out_shape, out.type.dtype, ctx))

        node._cache_elemwise_k(*args, broadcast=True)
        if config.gpuarray.sync:
@@ -453,10 +447,12 @@ class GpuDimShuffle(HideC, DimShuffle):
    _f16_ok = True

    def make_node(self, input):
+        ctx_name = infer_context_name(input)
        res = DimShuffle.make_node(self, input)
        otype = GpuArrayType(dtype=res.outputs[0].type.dtype,
-                             broadcastable=res.outputs[0].type.broadcastable)
-        input = as_gpuarray_variable(input)
+                             broadcastable=res.outputs[0].type.broadcastable,
+                             context_name=ctx_name)
+        input = as_gpuarray_variable(input, ctx_name)
        return Apply(self, [input], [otype()])

    def __str__(self):
@@ -588,7 +584,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
    quite possible that the GPU might be slower for some cases.

    """
-
+    __props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
+                 'pre_scalar_op')
    _f16_ok = True

    def __init__(self, scalar_op, axis=None,
@@ -607,24 +604,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        if pre_scalar_op:
            assert pre_scalar_op.nin == 1

-    def __eq__(self, other):
-        return (type(self) == type(other) and
-                self.axis == other.axis and
-                self.reduce_mask == other.reduce_mask and
-                self.dtype == other.dtype and
-                self.acc_dtype == other.acc_dtype and
-                self.scalar_op == other.scalar_op and
-                self.pre_scalar_op == other.pre_scalar_op)
-
-    def __hash__(self):
-        return (hash(type(self)) ^
-                hash(self.axis) ^
-                hash(self.reduce_mask) ^
-                hash(self.dtype) ^
-                hash(self.acc_dtype) ^
-                hash(type(self.scalar_op)) ^
-                hash(type(self.pre_scalar_op)))
-
    def __str__(self):
        pre = ""
        if self.pre_scalar_op:
@@ -641,7 +620,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            self.pre_scalar_op = None

    def make_node(self, x):
-        x = as_gpuarray_variable(x)
+        x = as_gpuarray_variable(x, infer_context_name(x))
+        if x.type.context.kind != 'cuda':
+            raise TypeError("GpuCAReduceCuda doesn't work for non-cuda devices")
        ret = super(GpuCAReduceCuda, self).make_node(x)
        self = copy.copy(self)
        self.axis = ret.op.axis
@@ -666,9 +647,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                "complex" in self._acc_dtype(x.dtype)):
            raise NotImplementedError("We don't support complex in gpu reduction")
        return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
-                                              ret.outputs[0].type.broadcastable)()])
+                                              ret.outputs[0].type.broadcastable,
+                                              context_name=x.type.context_name)()])

-    def perform(self, node, inp, out):
+    def get_context(self, node):
+        return node.inputs[0].type.context
+
+    def perform(self, node, inp, out, ctx):
        raise MethodNotDefined("")

    def supports_c_code(self, inputs):
@@ -698,7 +683,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        inp = ['fake_input_name_%d' % i for i in xrange(len(inputs))]
        out = ['fake_output_name_%d' % i for i in xrange(len(node.outputs))]

-        sub = {'fail': 'fake failure code'}
+        sub = {'fail': 'fake failure code', 'context': 'fake context'}

        try:
            self.c_code(node, name, inp, out, sub)
@@ -733,7 +718,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        if (PyGpuArray_NDIM(%(x)s) != %(nd_in)s)
        {
            PyErr_Format(PyExc_TypeError,
-                         "required nd=%(nd_in)s, got nd=%%i", PyGpuArray_NDIM(%(x)s));
+                         "required nd=%(nd_in)s, got nd=%%u", PyGpuArray_NDIM(%(x)s));
            %(fail)s;
        }
        """ % locals(), file=sio)
@@ -791,7 +776,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(%(nd_out)s, new_dims,
                                %(out_typecode)s, GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(ctx)s, Py_None);
            if (NULL == %(z)s)
            {
                PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
@@ -1338,8 +1323,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                     (void *)%(z)s->ga.data,
                                     (void *)&%(z)s->ga.offset};
            if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
-                                " n_threads=%%lu, size=%%lu, ndim=%%d\\n",
-                                n_threads,numEls,
+                                " n_threads=%%llu, size=%%llu, ndim=%%u\\n",
+                                n_threads, numEls,
                                PyGpuArray_NDIM(%(x)s));
            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
            int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
@@ -1521,9 +1506,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
            if (verbose) {
              fprintf(stderr,
-                "running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
-                n_blocks[0],
-                n_blocks[1]);
+                "running kernel_reduce_10_%(name)s n_blocks=(%%llu,%%llu)\\n",
+                (unsigned long long)n_blocks[0],
+                (unsigned long long)n_blocks[1]);
            }
            assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
@@ -1911,12 +1896,17 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)

    def c_code_cache_version_apply(self, node):
-        version = [17]  # the version corresponding to the c code in this Op
+        version = [18]  # the version corresponding to the c code in this Op

        # now we insert versions for the ops on which we depend...
-        version.extend(self.scalar_op.c_code_cache_version())
+        scalar_node = Apply(
+            self.scalar_op,
+            [Scalar(dtype=input.type.dtype)() for input in node.inputs],
+            [Scalar(dtype=output.type.dtype)() for output in node.outputs])
+        version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
        for i in node.inputs + node.outputs:
            version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
+        version.extend(self.kernel_version(node))
        if all(version):
            return tuple(version)
        else:
@@ -2644,7 +2634,6 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
    Too slow for now as it only have a python interface.

    """
-
    def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
        if not hasattr(scalar_op, 'identity'):
            raise ValueError("No identity on scalar op")
@@ -2658,10 +2647,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        return "GpuReduce{%s}%s" % (self.scalar_op, ax)

    def make_node(self, input):
+        ctx_name = infer_context_name(input)
        res = CAReduceDtype.make_node(self, input)
-        input = as_gpuarray_variable(input)
+        input = as_gpuarray_variable(input, ctx_name)
        otype = GpuArrayType(dtype=res.outputs[0].dtype,
-                             broadcastable=res.outputs[0].broadcastable)
+                             broadcastable=res.outputs[0].broadcastable,
+                             context_name=ctx_name)

        if res.op.axis is not None:
            redux = []
@@ -2673,11 +2664,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):

        return Apply(res.op, [input], [otype()])

+    def get_context(self, node):
+        return node.outputs[0].type.context
+
    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        # cache the kernel object
        self.get_kernel_cache(node)
-        return super(GpuCAReduceCPY, self).make_thunk(node, storage_map,
-                                                      compute_map, no_recycling)
+        return super(GpuCAReduceCPY, self).make_thunk(
+            node, storage_map, compute_map, no_recycling)

    def get_kernel_cache(self, node):
        attr = '@cache_reduction_k'
@@ -2776,33 +2770,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
                    j += 1
            code += """
         if (need_out) {
-             %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None);
+             %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, %(ctx)s, Py_None);
             if (!%(output)s) {
                 %(fail)s
             }
         }
        """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
+                   ctx=sub['context'],
                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))
        else:
            code += """
        if (%(output)s == NULL || %(output)s->ga.nd != 0) {
            Py_XDECREF(%(output)s);
            %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
-                                     pygpu_default_context(), Py_None);
+                                     %(ctx)s, Py_None);
            if (!%(output)s) {
                %(fail)s
            }
        }
-        """ % dict(output=output, fail=sub['fail'],
+        """ % dict(output=output, fail=sub['fail'], ctx=sub['context'],
                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))

        if acc_dtype != node.outputs[0].type.dtype:
            code += """
        tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
-                          %(acc_type)s, GA_C_ORDER, pygpu_default_context(),
-                          Py_None);
+                          %(acc_type)s, GA_C_ORDER, %(ctx)s, Py_None);
        if (!tmp) %(fail)s
-        """ % dict(output=output, fail=sub['fail'],
+        """ % dict(output=output, fail=sub['fail'], ctx=sub['context'],
                   acc_type=dtype_to_typecode(acc_dtype))
        else:
            code += """
@@ -2893,12 +2887,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
            reduce_expr = "a * b"
        else:
            raise NotImplementedError()
-        return ReductionKernel(pygpu.get_default_context(), odtype,
+        return ReductionKernel(node.inputs[0].type.context, odtype,
                               self.scalar_op.identity, reduce_expr, redux,
                               arguments=[make_argument(node.inputs[0], 'a')],
                               init_nd=node.inputs[0].ndim)

-    def perform(self, node, inp, out):
+    def perform(self, node, inp, out, ctx):
        input, = inp
        output, = out

@@ -2912,6 +2906,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
                copy=False, dtype=node.outputs[0].type.dtype)
        else:
            output[0] = pygpu.gpuarray.array(input, copy=True,
-                                             dtype=node.outputs[0].type.dtype)
+                                             dtype=node.outputs[0].type.dtype,
+                                             context=ctx)
 # To allow reloading old pickled files
 GpuCAReduce = GpuCAReduceCPY
--- a/theano/sandbox/gpuarray/gemm16.c
+++ b/theano/sandbox/gpuarray/gemm16.c
@@ -2,7 +2,7 @@

 /* Why do we need this? */
 size_t dim = 2048 * 32;
-rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(),
+rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, CONTEXT,
                       Py_None);
 if (rand_buf == NULL) {
  FAIL;
@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf;

 int gemm16(PyGpuArrayObject *C, float alpha,
           PyGpuArrayObject *A, PyGpuArrayObject *B,
-           float beta, PyGpuArrayObject **out) {
+           float beta, PyGpuArrayObject **out,
+           PyGpuContextObject *c) {
  PyGpuArrayObject *_A = NULL;
  PyGpuArrayObject *_B = NULL;
  GpuKernel *gk;

--- a/theano/sandbox/gpuarray/neighbours.py
+++ b/theano/sandbox/gpuarray/neighbours.py
@@ -10,7 +10,8 @@ try:
 except ImportError:
    pass

-from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
+from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
+                        infer_context_name)
 from .opt import register_opt as register_gpu_opt, op_lifter
 from .type import GpuArrayType

@@ -25,7 +26,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        self.mode = mode

    def make_node(self, ten4, neib_shape, neib_step):
-        ten4 = as_gpuarray_variable(ten4)
+        ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
        neib_shape = T.as_tensor_variable(neib_shape)
        neib_step = T.as_tensor_variable(neib_step)

@@ -37,7 +38,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):

        return Apply(self, [ten4, neib_shape, neib_step],
                     [GpuArrayType(broadcastable=(False, False),
-                                   dtype=ten4.type.dtype)()])
+                                   dtype=ten4.type.dtype,
+                                   context_name=ten4.type.context_name)()])
+
+    def get_context(self, node):
+        return node.inputs[0].type.context

    def c_code_cache_version(self):
        return (11,)
@@ -56,7 +61,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """
-//a version that use less register but don't work in all case.
+// a version that uses less registers but doesn't work in all cases.
        KERNEL void %(kname)s(
            const int nb_batch,
            const int nb_stack,
@@ -233,6 +238,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        return kernels

    def c_code(self, node, name, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError("cuda only")
        dtype_ten4 = node.inputs[0].dtype
        dtype_neib_shape = node.inputs[1].dtype
        dtype_neib_step = node.inputs[2].dtype
@@ -243,6 +250,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        ten4, neib_shape, neib_step = inp
        z, = out
        fail = sub['fail']
+        ctx = sub['context']
        mode = self.mode
        err_check = """
            if (err != GA_NO_ERROR) {
@@ -369,8 +377,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                dims[0] = z_dim0;
                dims[1] = z_dim1;
                %(z)s = pygpu_empty(2, dims, %(typecode_z)s,
-                                    GA_C_ORDER, pygpu_default_context(),
-                                    Py_None);
+                                    GA_C_ORDER, %(ctx)s, Py_None);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
@@ -453,7 +460,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):


 @op_lifter([Images2Neibs])
-def use_gpu_images2neibs(node):
+def use_gpu_images2neibs(node, context_name):
    if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
        return GpuImages2Neibs(node.op.mode)


--- a/theano/sandbox/gpuarray/nerv.py
+++ b/theano/sandbox/gpuarray/nerv.py
@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
 from theano.scalar import as_scalar, constant

 from . import opt
-from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty)
+from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
+                        infer_context_name)
+from .type import gpu_context_type
 from .opt_util import alpha_merge, output_merge
-from .pycuda_helper import ensure_pycuda_context
-

 try:
    from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
@@ -43,6 +43,7 @@ def ensure_float(val, name):
 class Gemm16(COp):
    __props__ = ('relu', 'inplace')
    _f16_ok = True
+    context_type = gpu_context_type
    KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
                  'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
                  'tn_128x128', 'tn_128x64', 'tn_128x32',
@@ -61,10 +62,11 @@ class Gemm16(COp):
    def make_node(self, C, alpha, A, B, beta):
        if GPUTensor is None:
            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
+        ctx_name = infer_context_name(C, A, B)

-        A = as_gpuarray_variable(A)
-        B = as_gpuarray_variable(B)
-        C = as_gpuarray_variable(C)
+        A = as_gpuarray_variable(A, ctx_name)
+        B = as_gpuarray_variable(B, ctx_name)
+        C = as_gpuarray_variable(C, ctx_name)

        alpha = ensure_float(alpha, 'alpha')
        beta = ensure_float(beta, 'beta')
@@ -73,27 +75,8 @@ class Gemm16(COp):

        return Apply(self, [C, alpha, A, B, beta], [C.type()])

-    def perform(self, node, inputs, outputs):
-        ensure_pycuda_context()
-        C, alpha, A, B, beta = inputs
-        # The nervana code does not support the case where both inputs
-        # are trans, so we need to copy one if them if that is the
-        # case. We copy the smaller one.
-        if A.flags.f_contiguous and B.flags.f_contiguous:
-            if A.size < B.size:
-                A = A.copy()
-            else:
-                B = B.copy()
-        inplace = self.inplace
-        if inplace and not C.flags.c_contiguous:
-            inplace = False
-        if not inplace:
-            C = C.copy()
-        At = to_gputensor(A)
-        Bt = to_gputensor(B)
-        Ct = to_gputensor(C)
-        nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
-        outputs[0][0] = C
+    def get_context(self, node):
+        return node.inputs[0].type.context

    def c_headers(self):
        return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
            codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
        codel.append("const char *bcode;")
        codel.append("size_t sz;")
-        codel.append("PyGpuContextObject *c = pygpu_default_context();")
+        codel.append("PyGpuContextObject *c = %s;" % (sub['context'],))
        codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
                     "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
                     "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
@@ -162,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,

 @opt.register_opt()
 @opt.op_lifter([tensor.Dot])
-def local_dot_to_gemm16(node):
+def local_dot_to_gemm16(node, ctx_name):
    if nerv is None:
        return
    A = node.inputs[0]
@@ -170,7 +153,7 @@ def local_dot_to_gemm16(node):
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
-        C = GpuAllocEmpty(dtype='float16')(
+        C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
            shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
        return Gemm16()(C, 1.0, A, B, 0.0)


--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
@@ -10,7 +10,8 @@ try:
 except ImportError:
    pass

-from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
+from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
+                        infer_context_name)
 from .type import GpuArrayType
 from .kernel_codegen import (nvcc_kernel,
                             inline_softmax,
@@ -23,23 +24,26 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.

    """
-
    nin = 3
    nout = 3
    __props__ = ()
    _f16_ok = True

    def make_node(self, x, b, y_idx):
-        # N.B. won't work when we don't cast y_idx to float anymore
-        x = as_gpuarray_variable(x)
-        b = as_gpuarray_variable(b)
-        y_idx = as_gpuarray_variable(y_idx)
+        ctx_name = infer_context_name(x, b, y_idx)
+        x = as_gpuarray_variable(x, ctx_name)
+        b = as_gpuarray_variable(b, ctx_name)
+        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        nll = GpuArrayType(x.type.dtype,
-                           y_idx.type.broadcastable)()
+                           y_idx.type.broadcastable,
+                           context_name=ctx_name)()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

@@ -144,6 +148,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
                       flags=flags, objvar=k_var)]

    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError('cuda only')
        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
        typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
@@ -163,6 +169,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        dtype_am = node.outputs[2].dtype
        classname = self.__class__.__name__
        fail = sub['fail']
+        ctx = sub['context']
        k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
        err_check = """
            if (err != GA_NO_ERROR) {
@@ -214,9 +221,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        {
            Py_XDECREF(%(nll)s);
            %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                %(typecode_x)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode_x)s, GA_C_ORDER, %(ctx)s,
+                                Py_None);
            if (!%(nll)s) {
                %(fail)s
            }
@@ -229,9 +235,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                %(typecode_b)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode_b)s, GA_C_ORDER,
+                                %(ctx)s, Py_None);
            if(!%(sm)s)
            {
                PyErr_SetString(PyExc_MemoryError,
@@ -246,9 +251,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        {
            Py_XDECREF(%(am)s);
            %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                %(typecode_y_idx)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode_y_idx)s, GA_C_ORDER,
+                                %(ctx)s, Py_None);
            if(!%(am)s)
            {
                PyErr_SetString(PyExc_MemoryError,
@@ -306,18 +310,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
    Gradient wrt x of the CrossentropySoftmax1Hot Op.

    """
-
    nin = 3
    nout = 1
    __props__ = ()
    _f16_ok = True

    def make_node(self, dnll, sm, y_idx):
-        dnll = as_gpuarray_variable(dnll)
-        sm = as_gpuarray_variable(sm)
-        y_idx = as_gpuarray_variable(y_idx)
+        ctx_name = infer_context_name(dnll, sm, y_idx)
+        dnll = as_gpuarray_variable(dnll, ctx_name)
+        sm = as_gpuarray_variable(sm, ctx_name)
+        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        return Apply(self, [dnll, sm, y_idx], [sm.type()])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def c_code_cache_version(self):
        return (11,)

@@ -325,6 +332,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError("cuda only")
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
@@ -338,6 +347,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
+        ctx = sub['context']
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        err_check = """
            if (err != GA_NO_ERROR) {
@@ -403,9 +413,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        {
            Py_XDECREF(%(dx)s);
            %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
-                                 %(typecode_dx)s,
-                                 GA_C_ORDER,
-                                 pygpu_default_context(), Py_None);
+                                 %(typecode_dx)s, GA_C_ORDER,
+                                 %(ctx)s, Py_None);
            if (!%(dx)s) {
                %(fail)s
            }
@@ -512,14 +521,16 @@ class GpuSoftmax(GpuKernelBase, Op):
    Implement Softmax on the gpu.

    """
-
    __props__ = ()
    _f16_ok = True

    def make_node(self, x):
-        x = as_gpuarray_variable(x)
+        x = as_gpuarray_variable(x, infer_context_name(x))
        return Apply(self, [x], [x.type()])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def infer_shape(self, node, shape):
        return shape

@@ -530,6 +541,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError("cuda only")
        dtype_x = node.inputs[0].dtype
        work_x = work_dtype(dtype_x)
        dtype_z = node.outputs[0].dtype
@@ -539,6 +552,7 @@ class GpuSoftmax(GpuKernelBase, Op):
        x, = inp
        z, = out
        fail = sub['fail']
+        ctx = sub['context']
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
@@ -568,9 +582,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                %(typecode)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode)s, GA_C_ORDER,
+                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
@@ -698,22 +711,25 @@ class GpuSoftmax(GpuKernelBase, Op):
 gpu_softmax = GpuSoftmax()


-class GpuSoftmaxWithBias (GpuKernelBase, Op):
+class GpuSoftmaxWithBias(GpuKernelBase, Op):
    """
    Implement SoftmaxWithBias on the gpu.

    """
-
    nin = 2
    nout = 1
    __props__ = ()
    _f16_ok = True

    def make_node(self, x, b):
-        x = as_gpuarray_variable(x)
-        b = as_gpuarray_variable(b)
+        ctx_name = infer_context_name(x, b)
+        x = as_gpuarray_variable(x, ctx_name)
+        b = as_gpuarray_variable(b, ctx_name)
        return Apply(self, [x, b], [x.type()])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    def infer_shape(self, node, shape):
        return [shape[0]]

@@ -724,6 +740,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError('cuda only')
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
@@ -735,6 +753,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        x, b = inp
        z, = out
        fail = sub['fail']
+        ctx = sub['context']
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
@@ -777,9 +796,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                %(typecode)s,
-                                GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode)s, GA_C_ORDER,
+                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -3,11 +3,6 @@ import numpy
 import logging
 from six.moves import xrange

-try:
-    import pygpu
-except ImportError:
-    pass
-
 import theano
 from theano import tensor, scalar, gof
 from theano.compile import optdb
@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.nnet.conv import ConvOp
 from theano.tests.breakpoint import PdbBreakpoint

-from .type import GpuArrayType, GpuArrayConstant
-from .basic_ops import (as_gpuarray_variable,
-                        host_from_gpu, gpu_from_host,
+from .type import GpuArrayType, GpuArrayConstant, get_context
+from .basic_ops import (as_gpuarray_variable, infer_context_name,
+                        host_from_gpu, GpuToGpu,
                        HostFromGpu, GpuFromHost,
                        GpuSplit, GpuContiguous,
-                        gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape,
+                        GpuAlloc, GpuAllocEmpty, GpuReshape,
                        GpuEye, gpu_join, GpuJoin)
 from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
                   gpugemm_no_inplace)
@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert',
                       'unsafe')


-def safe_to_gpu(x):
+def safe_to_gpu(x, ctx_name):
    if isinstance(x.type, tensor.TensorType):
-        return gpu_from_host(x)
+        return GpuFromHost(ctx_name)(x)
    else:
        return x

@@ -102,24 +97,49 @@ def op_lifter(OP, cuda_only=False):
    """
    def f(maker):
        def local_opt(node):
-            dev = theano.sandbox.gpuarray.init_dev.device
-            if cuda_only and not dev.startswith('cuda'):
-                return
-
            if type(node.op) in OP:
-
                # Either one of our inputs is on the gpu or
-                # all of our client are on the gpu
-                if (any([i.owner and i.owner.op == host_from_gpu
-                         for i in node.inputs]) or
-                    all([c != 'output' and c.op == gpu_from_host
-                         for c, idx in node.outputs[0].clients])):
-                    new_op = maker(node)
-                    # This is needed as sometimes new_op inherit from OP.
+                # all of our clients are on the gpu
+                replace = False
+                # TODO: Maybe set context_name with infer_context_name()?
+                context_name = None
+                # We replace if any input is a host_from_gpu
+                for i in node.inputs:
+                    if i.owner and i.owner.op == host_from_gpu:
+                        context_name = i.owner.inputs[0].type.context_name
+                        replace = True
+                        break
+                if not replace:
+                    # We replace if *all* clients are on the GPU
+                    clients = [c for o in node.outputs for c in o.clients]
+                    replace = len(clients) != 0
+                    for c, idx in clients:
+                        if (c == 'output' or
+                                not isinstance(c.op, GpuFromHost)):
+                            replace = False
+                    # TODO: check that the clients want the same context?
+                    if replace:
+                        # All clients are GpuFromHost and we have at least one
+                        context_name = clients[0][0].op.context_name
+
+                # Check if we should replace
+                if (not replace or
+                    (cuda_only and
+                     get_context(context_name).kind != 'cuda')):
+                    return False
+
+                new_op = maker(node, context_name)
+                # This is needed as sometimes new_op inherits from OP.
                if new_op and new_op != node.op:
                    if isinstance(new_op, theano.Op):
+                        # tag the inputs with the context in case
+                        # the context was derived from the outputs
+                        def tag(i, ctx):
+                            i.tag.context_name = ctx
+                            return i
+                        inputs = [tag(i, context_name) for i in node.inputs]
                        return [safe_to_cpu(o) for o in
-                                    new_op(*node.inputs, return_list=True)]
+                                new_op(*inputs, return_list=True)]
                    elif isinstance(new_op, (tuple, list)):
                        return [safe_to_cpu(o) for o in new_op]
                    else:  # suppose it is a variable on the GPU
@@ -146,35 +166,81 @@ class InputToGpuOptimizer(Optimizer):

            if (len(input.clients) == 1 and
                (input.clients[0][0] == 'output' or
-                 input.clients[0][0].op == gpu_from_host)):
+                 isinstance(input.clients[0][0].op, GpuFromHost))):
                continue

+            ctx_name = getattr(input.tag, 'context_name', None)
            try:
-                new_input = host_from_gpu(gpu_from_host(input))
+                new_input = host_from_gpu(GpuFromHost(ctx_name)(input))
                fgraph.replace_validate(input, new_input,
                                        "InputToGpuOptimizer")
            except TypeError:
                # This could fail if the inputs are not TensorTypes
                pass
+            except ValueError:
+                # If there is no context tag and no default context
+                # then it stays on the CPU
+                if not hasattr(input.tag, 'context_name'):
+                    raise
+                pass
+

 gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
                    0, 'fast_run', 'fast_compile', 'merge')


-@local_optimizer([gpu_from_host, host_from_gpu])
-def local_cut_gpu_host_gpu(node):
-    if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
-        return [node.inputs[0].owner.inputs[0]]
-    if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host):
-        return [node.inputs[0].owner.inputs[0]]
-    return False
-gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu,
+@local_optimizer([GpuFromHost, GpuToGpu, host_from_gpu])
+def local_cut_gpu_transfers(node):
+    # gpu[ab] -> host -> gpub
+    if (isinstance(node.op, GpuFromHost) and
+            node.inputs[0].owner and
+            node.inputs[0].owner.op == host_from_gpu):
+        other = node.inputs[0].owner.inputs[0]
+        if node.op.context_name == other.type.context_name:
+            return [other]
+        else:
+            return [GpuToGpu(node.op.context_name)(other)]
+
+    # ? -> gpua -> host
+    elif (node.op == host_from_gpu and
+          node.inputs[0].owner):
+        n2 = node.inputs[0].owner
+
+        # host ->
+        if isinstance(n2.op, GpuFromHost):
+            return [n2.inputs[0]]
+
+        # gpub ->
+        if isinstance(n2.op, GpuToGpu):
+            return [host_from_gpu(n2.inputs[0])]
+
+    # ? -> gpua -> gpub
+    elif isinstance(node.op, GpuToGpu):
+        # Transfer within same context
+        if node.inputs[0].type.context_name == node.op.context_name:
+            return [node.inputs[0]]
+
+        if node.inputs[0].owner:
+            n2 = node.inputs[0].owner
+
+            # host ->
+            if isinstance(n2.op, GpuFromHost):
+                return [GpuFromHost(node.op.context_name)(n2.inputs[0])]
+
+            # gpuc ->
+            if isinstance(n2.op, GpuToGpu):
+                if node.op.context_name == n2.inputs[0].type.context_name:
+                    return [n2.inputs[0]]
+                else:
+                    return [node.op(n2.inputs[0])]
+
+gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers,
                        'fast_compile', 'fast_run', 'inplace', 'gpuarray')
 gpu_cut_copies.register('cut_gpua_constant_transfers',
                        tensor.opt.constant_folding,
                        'fast_compile', 'fast_run', 'gpuarray')
 optdb['canonicalize'].register('local_cut_gpua_host_gpua',
-                               local_cut_gpu_host_gpu,
+                               local_cut_gpu_transfers,
                               'fast_compile', 'fast_run', 'gpuarray')


@@ -187,6 +253,11 @@ def local_gpuaalloc2(node):
    Moves an alloc that is an input to join to the gpu.

    """
+    try:
+        get_context(None)
+    except ValueError:
+        # If there is no default context then we do not perform the move here.
+        return
    if (isinstance(node.op, tensor.Alloc) and
        all(c != 'output' and
            c.op == tensor.join and
@@ -194,23 +265,13 @@ def local_gpuaalloc2(node):
                i.owner.op in [host_from_gpu, tensor.alloc]
                for i in c.inputs[1:])
            for c, idx in node.outputs[0].clients)):
-        return [host_from_gpu(gpu_alloc(*node.inputs))]
+        return [host_from_gpu(GpuAlloc(None)(*node.inputs))]


 @register_opt('fast_compile')
 @op_lifter([tensor.Alloc])
-def local_gpuaalloc(node):
-    new_out = gpu_alloc(*node.inputs)
-    # We need to hide new broadcastable dimensions because
-    # ReplaceValidate doesn't like when they change.
-    if new_out.broadcastable != node.outputs[0].broadcastable:
-        # but if a dim is suddenly not broadcastable anymore then that's a bug
-        for b_old, b_new in zip(node.outputs[0].broadcastable,
-                                new_out.broadcastable):
-            assert b_new or (not b_old)
-        new_out = tensor.patternbroadcast(new_out,
-                                          node.outputs[0].broadcastable)
-    return (new_out,)
+def local_gpuaalloc(node, context_name):
+    return GpuAlloc(context_name)(*node.inputs)


 @register_opt()
@@ -221,8 +282,8 @@ def local_gpualloc_memset_0(node):
        if (isinstance(inp, GpuArrayConstant) and
                inp.data.size == 1 and
                (numpy.asarray(inp.data) == 0).all()):
-            new_out = GpuAlloc(memset_0=True)(*node.inputs)
-            return [new_out]
+            new_op = GpuAlloc(node.op.context_name, memset_0=True)
+            return [new_op(*node.inputs)]


 @register_opt()
@@ -240,7 +301,7 @@ def local_gpu_contiguous_gpu_contiguous(node):

 @register_opt('fast_compile')
 @op_lifter([tensor.Reshape])
-def local_gpureshape(node):
+def local_gpureshape(node, context_name):
    op = node.op
    name = op.name
    if name:
@@ -251,14 +312,14 @@ def local_gpureshape(node):

 @register_opt('fast_compile')
 @op_lifter([tensor.Rebroadcast])
-def local_gpu_rebroadcast(node):
+def local_gpu_rebroadcast(node, context_name):
    if isinstance(node.inputs[0].owner.op, HostFromGpu):
        return node.op(node.inputs[0].owner.inputs[0])


 @register_opt('fast_compile')
 @op_lifter([tensor.Flatten])
-def local_gpuflatten(node):
+def local_gpuflatten(node, context_name):
    op = node.op
    shp = []
    if op.outdim != 1:
@@ -271,7 +332,7 @@ def local_gpuflatten(node):

 @register_opt('fast_compile')
 @op_lifter([tensor.Elemwise])
-def local_gpu_elemwise(node):
+def local_gpu_elemwise(node, context_name):
    op = node.op
    scal_op = op.scalar_op
    name = op.name
@@ -344,28 +405,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,

 @register_opt('fast_compile')
 @op_lifter([tensor.DimShuffle])
-def local_gpua_dimshuffle(node):
+def local_gpua_dimshuffle(node, context_name):
    return GpuDimShuffle(node.op.input_broadcastable,
                         node.op.new_order)


 @register_opt('fast_compile')
 @op_lifter([tensor.SpecifyShape])
-def local_gpua_specifyShape(node):
+def local_gpua_specifyShape(node, context_name):
    if isinstance(node.inputs[0].type, GpuArrayType):
        return
-    inp = [gpu_from_host(node.inputs[0])] + node.inputs[1:]
+    inp = [GpuFromHost(context_name)(node.inputs[0])] + node.inputs[1:]
    return tensor.specify_shape(*inp)


 @register_opt('fast_compile')
 @op_lifter([theano.compile.ops.Shape])
-def local_gpua_shape(node):
+def local_gpua_shape(node, context_name):
    # op_lifter will call this opt too frequently as the output is
    # always on the CPU.
    if isinstance(node.inputs[0].type, GpuArrayType):
        return
-    return [gpu_from_host(node.inputs[0]).shape]
+    return [GpuFromHost(context_name)(node.inputs[0]).shape]


 def gpu_print_wrapper(op, cnda):
@@ -374,7 +435,7 @@ def gpu_print_wrapper(op, cnda):

 @register_opt('fast_compile')
 @op_lifter([tensor.printing.Print])
-def local_gpu_print_op(node):
+def local_gpu_print_op(node, context_name):
    x, = node.inputs
    gpu_x, = x.owner.inputs
    new_op = node.op.__class__(global_fn=gpu_print_wrapper)
@@ -404,9 +465,14 @@ def local_gpu_pdbbreakpoint_op(node):

            input_is_from_gpu = (inp.owner and
                                 isinstance(inp.owner.op, HostFromGpu))
-            output_goes_to_gpu = any([c[0] != "output" and
-                                      isinstance(c[0].op, GpuFromHost)
-                                      for c in out.clients])
+            output_goes_to_gpu = False
+            for c in out.clients:
+                if c == 'output':
+                    continue
+                if isinstance(c[0].op, GpuFromHost):
+                    output_goes_to_gpu = True
+                    context_name = c[0].op.context_name
+                    break

            if input_is_from_gpu:
                # The op should be applied on the GPU version of the input
@@ -415,7 +481,7 @@ def local_gpu_pdbbreakpoint_op(node):

            elif output_goes_to_gpu:
                # The input should be transfered to the gpu
-                new_inputs.append(gpu_from_host(inp))
+                new_inputs.append(GpuFromHost(context_name)(inp))
                input_transfered.append(True)

            else:
@@ -447,7 +513,7 @@ def local_gpu_pdbbreakpoint_op(node):

 @register_opt('fast_compile')
 @op_lifter([tensor.Join])
-def local_gpua_join(node):
+def local_gpua_join(node, context_name):
    return gpu_join


@@ -462,13 +528,13 @@ def local_gpuajoin_1(node):

 @register_opt('fast_compile')
 @op_lifter([tensor.Split])
-def local_gpua_split(node):
+def local_gpua_split(node, context_name):
    return GpuSplit(node.op.len_splits)


 @register_opt('fast_compile')
 @op_lifter([tensor.Subtensor])
-def local_gpua_subtensor(node):
+def local_gpua_subtensor(node, context_name):
    x = node.inputs[0]
    if (x.owner and isinstance(x.owner.op, HostFromGpu)):
        gpu_x = x.owner.inputs[0]
@@ -482,14 +548,14 @@ def local_gpua_subtensor(node):
                        for n, _ in node.outputs[0].clients]):
                    return
                else:
-                    return [host_from_gpu(gpu_from_host(node.outputs[0]))]
+                    return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))]

    return GpuSubtensor(node.op.idx_list)


 @register_opt('fast_compile')
 @op_lifter([tensor.IncSubtensor])
-def local_gpua_incsubtensor(node):
+def local_gpua_incsubtensor(node, context_name):
    return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
                           node.op.set_instead_of_inc,
                           node.op.destroyhandler_tolerate_aliased)
@@ -497,16 +563,16 @@ def local_gpua_incsubtensor(node):

 @register_opt('fast_compile')
 @op_lifter([tensor.AdvancedSubtensor1])
-def local_gpua_advanced_subtensor(node):
+def local_gpua_advanced_subtensor(node, context_name):
    return GpuAdvancedSubtensor1()


 @register_opt('fast_compile')
 @op_lifter([tensor.AdvancedIncSubtensor1])
-def local_gpua_advanced_incsubtensor(node):
+def local_gpua_advanced_incsubtensor(node, context_name):

-    # This optimization is disabled if cuda is not active
-    if pygpu.get_default_context().kind != "cuda":
+    # This is disabled on non-cuda contexts
+    if get_context(context_name).kind != 'cuda':
        return None

    x, y, ilist = node.inputs
@@ -535,17 +601,19 @@ def local_gpua_advanced_incsubtensor(node):

 @register_opt('fast_compile')
 @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
-def local_gpua_careduce(node):
+def local_gpua_careduce(node, context_name):
    if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
                                      scalar.Maximum, scalar.Minimum)):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if dev.startswith('opencl'):
+        ctx = get_context(context_name)
+        if ctx.kind == 'opencl':
            op = GpuCAReduceCPY
            if node.op.scalar_op not in [scalar.add, scalar.mul]:
                # We don't support yet all reduction with cpy code.
                return
-        else:
+        elif ctx.kind == 'cuda':
            op = GpuCAReduceCuda
+        else:
+            return False
        x, = node.inputs

        greduce = op(
@@ -556,7 +624,7 @@ def local_gpua_careduce(node):
        # We need to have the make node called, otherwise the mask can
        # be None
        if (op is GpuCAReduceCPY or
-                gvar.owner.op.supports_c_code([gpu_from_host(x)])):
+                gvar.owner.op.supports_c_code([GpuFromHost(context_name)(x)])):
            return greduce
        else:
            # Try to make a simpler pattern based on reshaping
@@ -596,7 +664,7 @@ def local_gpua_careduce(node):
                acc_dtype=getattr(node.op, 'acc_dtype', None))

            reshaped_x = x.reshape(tensor.stack(new_in_shp))
-            gpu_reshaped_x = gpu_from_host(reshaped_x)
+            gpu_reshaped_x = GpuFromHost(context_name)(reshaped_x)
            gvar = greduce(gpu_reshaped_x)
            # We need to have the make node called, otherwise the mask can
            # be None
@@ -615,19 +683,19 @@ def local_gpua_careduce(node):

 @register_opt('fast_compile')
 @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
-def local_gpua_gemv(node):
+def local_gpua_gemv(node, context_name):
    return GpuGemv(inplace=node.op.inplace)


 @register_opt('fast_compile')
 @op_lifter([tensor.blas.Gemm])
-def local_gpua_gemm(node):
+def local_gpua_gemm(node, context_name):
    return GpuGemm(inplace=node.op.inplace)


 @register_opt('fast_compile')
 @op_lifter([tensor.basic.Dot])
-def local_gpua_hgemm(node):
+def local_gpua_hgemm(node, context_name):
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        _logger.warning("Not performing dot of float16 on the GPU since "
@@ -639,7 +707,8 @@ def local_gpua_hgemm(node):
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
-        C = GpuAllocEmpty(dtype='float16')(shape_i(A, 0, fgraph),
+        C = GpuAllocEmpty(dtype='float16', context_name=context_name)(
+            shape_i(A, 0, fgraph),
            shape_i(B, 1, fgraph))
        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)

@@ -658,49 +727,49 @@ def local_gpuagemm_output_merge(node, *inputs):

 @register_opt('fast_compile')
 @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
-def local_gpua_ger(node):
-    return GpuGer(destructive=node.op.destructive)
+def local_gpua_ger(node, context_name):
+    return GpuGer(inplace=node.op.destructive)


 @register_opt('fast_compile')
 @op_lifter([tensor.blas.Dot22])
-def local_gpua_dot22(node):
+def local_gpua_dot22(node, context_name):
    return gpu_dot22


 @register_opt('fast_compile')
 @op_lifter([tensor.basic.Eye])
-def local_gpua_eye(node):
-    return GpuEye(dtype=node.op.dtype)
+def local_gpua_eye(node, context_name):
+    return GpuEye(dtype=node.op.dtype, context_name=context_name)


 @register_opt('fast_compile')
 @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
-def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
+def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name):
    return GpuCrossentropySoftmaxArgmax1HotWithBias()


 @register_opt('fast_compile')
 @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
-def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
+def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name):
    return GpuCrossentropySoftmax1HotWithBiasDx()


 @register_opt('fast_compile')
 @op_lifter([tensor.nnet.Softmax], cuda_only=True)
-def local_gpua_softmax(node):
+def local_gpua_softmax(node, context_name):
    return GpuSoftmax()


 @register_opt('fast_compile')
 @op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
-def local_gpua_softmaxwithbias(node):
+def local_gpua_softmaxwithbias(node, context_name):
    return GpuSoftmaxWithBias()


 @register_opt('fast_compile')
 @op_lifter([theano.tensor.opt.Assert])
-def local_assert(node):
+def local_assert(node, context_name):
    if (node.inputs[0].owner and
            isinstance(node.inputs[0].owner.op, HostFromGpu)):
        return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0],
@@ -708,21 +777,14 @@ def local_assert(node):


 @register_opt('fast_compile')
-@op_lifter([gpu_from_host, ConvOp])
-def local_gpu_conv(node):
-    """
-    gpu_from_host(conv) -> gpu_conv(gpu_from_host)
-
-    conv(host_from_gpu) -> host_from_gpu(gpu_conv)
-
-    """
+@op_lifter([ConvOp])
+def local_gpu_conv(node, context_name):
    def GpuConvOp_from_ConvOp(op):
        logical_img_hw = None

        if op.kshp_logical is not None and op.kshp_logical != op.kshp:
            return None
-        # print op.kshp, op.imshp[1:3]
-        # print op.kshp_logical, logical_img_hw
+
        ret = GpuConv(border_mode=op.out_mode,
                      subsample=(op.dx, op.dy),
                      logical_img_hw=logical_img_hw,
@@ -735,13 +797,10 @@ def local_gpu_conv(node):
                      imshp=op.imshp,
                      nkern=op.nkern,
                      bsize=op.bsize,
-                      fft_opt=op.fft_opt
-                      )
+                      fft_opt=op.fft_opt)
        if op.imshp_logical is not None:
            logical_img_hw = op.imshp_logical[1:3]
            if logical_img_hw != op.imshp[1:3]:
-                # this case is not implemented
-                # return None
                rstride = int(numpy.ceil(op.imshp_logical[1] /
                                         float(op.imshp[1])))
                cstride = int(numpy.ceil(op.imshp_logical[2] /
@@ -752,7 +811,7 @@ def local_gpu_conv(node):
                                       img.shape[0], *op.imshp_logical)
                    img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
                                               img)
-                    img = gpu_from_host(img)
+                    img = GpuFromHost(context_name)(img)
                    return ret(img, kern)

                return make_graph
@@ -779,15 +838,13 @@ def local_gpu_conv(node):
    gpu_conv = GpuConvOp_from_ConvOp(node.op)
    if gpu_conv is None:
        return
-    out = gpu_conv(gpu_from_host(img),
-                   gpu_from_host(kern))
-    # in some case the ConvOp broadcast the last 2 dimensions
-    # differently then the gpu ConvOp
-    out = tensor.patternbroadcast(
-        host_from_gpu(out),
-        node.outputs[0].broadcastable)
-    # op_lifter want the output on the GPU.
-    out = gpu_from_host(out)
+    out = gpu_conv(GpuFromHost(context_name)(img),
+                   GpuFromHost(context_name)(kern))
+    assert isinstance(out.type, GpuArrayType)
+    # Make sure to keep the broadcastable pattern of the original
+    # convolution even if we might gain or lose some due to different
+    # information at the node level.
+    out = tensor.patternbroadcast(out, node.outputs[0].broadcastable)
    out.values_eq_approx = values_eq_approx
    return [out]

@@ -818,9 +875,10 @@ def local_gpu_elemwise_careduce(node):
                                pre_scalar_op=scalar.basic.sqr)(inp)]


-def tensor_to_gpu(x):
+def tensor_to_gpu(x, context_name):
    if isinstance(x.type, tensor.TensorType):
        y = GpuArrayType(broadcastable=x.type.broadcastable,
+                         context_name=context_name,
                         dtype=x.type.dtype)()
        if x.name:
            y.name = x.name + '[Gpua]'
@@ -842,6 +900,7 @@ def gpu_safe_new(x, tag=''):
        nw_name = x.name + tag
    else:
        nw_name = None
+
    if isinstance(x, theano.Constant):
        return x.clone()

@@ -870,7 +929,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):

 @register_opt('scan', 'fast_compile')
 @op_lifter([scan_op.Scan])
-def local_scan_to_gpua(node):
+def local_scan_to_gpua(node, context_name):
    info = copy.deepcopy(node.op.info)
    if info.get('gpua', False):
        return
@@ -882,20 +941,20 @@ def local_scan_to_gpua(node):
         node.op.n_mit_sot +
         node.op.n_sit_sot +
         node.op.n_shared_outs)
-    nw_ins += [safe_to_gpu(x) for x in node.inputs[1:e]]
+    nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]]
    b = e
    e = e + node.op.n_nit_sot
    nw_ins += node.inputs[b:e]
-    nw_ins += [safe_to_gpu(x) for x in node.inputs[e:]]
-    scan_ins = [tensor_to_gpu(x) for x in node.op.inputs]
+    nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]]
+    scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs]

    # The inner output corresponding to the looping condition should not be
    # moved to the gpu
    if node.op.info['as_while']:
-        scan_outs = [safe_to_gpu(x) for x in node.op.outputs[:-1]]
+        scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]]
        scan_outs += [node.op.outputs[-1]]
    else:
-        scan_outs = [safe_to_gpu(x) for x in node.op.outputs]
+        scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs]
    scan_outs = scan_utils.clone(
        scan_outs,
        replace=list(zip(node.op.inputs,
@@ -909,12 +968,25 @@ def local_scan_to_gpua(node):
    _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
    info['gpu_hash'] = hash(_cmodule_key)

+    def typebuild(dtype, broadcastable, context_name=context_name):
+        return GpuArrayType(dtype=dtype, broadcastable=broadcastable,
+                            context_name=context_name)
+
    nw_op = scan_op.Scan(scan_ins, scan_outs, info,
-                         typeConstructor=GpuArrayType).make_node(*nw_ins)
+                         typeConstructor=typebuild).make_node(*nw_ins)
    return nw_op.outputs

+
+def _scan_type_infer(node):
+    context_name = infer_context_name(*node.inputs)
+
+    def typebuild(dtype, broadcastable, context_name=context_name):
+        return GpuArrayType(dtype=dtype, broadcastable=broadcastable,
+                            context_name=context_name)
+    return typebuild
+
 optdb.register('gpua_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType,
+               scan_opt.ScanInplaceOptimizer(typeInfer=_scan_type_infer,
                                             gpua_flag=True),
               75,
               'gpuarray',

--- a/theano/sandbox/gpuarray/opt_util.py
+++ b/theano/sandbox/gpuarray/opt_util.py
@@ -294,7 +294,7 @@ def inplace_allocempty(op, idx):
    function can be as simple as:

        def maker(node, inputs):
-            return node.op.__class__(inplace=True)(*inputs)
+            return [node.op.__class__(inplace=True)(*inputs)]

    Parameters
    ----------
@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx):
            if (alloc.owner and
                    isinstance(alloc.owner.op, GpuAllocEmpty) and
                    len(alloc.clients) > 1):
-                alloc_op = GpuAllocEmpty(alloc.owner.op.dtype)
+                alloc_op = GpuAllocEmpty(alloc.owner.op.dtype,
+                                         alloc.owner.op.context_name)
                inputs[idx] = alloc_op(*alloc.owner.inputs)
            return maker(node, inputs)
        return opt

--- a/theano/sandbox/gpuarray/pycuda_helper.py
+++ b/theano/sandbox/gpuarray/pycuda_helper.py
-try:
-    from pycuda.driver import Context
-    if not hasattr(Context, 'attach'):
-        raise ImportError('too old')
-except ImportError:
-    Context = None
-
-pycuda_initialized = False
-pycuda_context = None
-
-
-def ensure_pycuda_context():
-    global pycuda_context, pycuda_initialized
-    if not pycuda_initialized:
-        if Context is None:
-            raise RuntimeError("PyCUDA not found or too old.")
-        else:
-            pycuda_context = Context.attach()
-            import atexit
-            atexit.register(pycuda_context.detach)
-            pycuda_initialized = True
-    return pycuda_context
--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
 from __future__ import print_function

-import copy
 import os
+import copy

 import numpy

 import theano
-from theano import tensor, gof, config
-from theano.gof.utils import MethodNotDefined
+from theano import tensor, gof
 from six.moves import StringIO
 from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
 import theano.tensor.inplace
@@ -19,7 +18,8 @@ except ImportError:
    pass

 from .type import GpuArrayType
-from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
+from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
+                        infer_context_name)
 from .elemwise import GpuElemwise


@@ -27,10 +27,12 @@ class GpuSubtensor(HideC, Subtensor):
    _f16_ok = True

    def make_node(self, x, *inputs):
+        ctx_name = infer_context_name(x)
        rval = tensor.Subtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
-                             broadcastable=rval.outputs[0].type.broadcastable)
-        x = as_gpuarray_variable(x)
+                             broadcastable=rval.outputs[0].type.broadcastable,
+                             context_name=ctx_name)
+        x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
@@ -191,14 +193,18 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)

    def make_node(self, x, y, *inputs):
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(x, y)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        op = copy.copy(self)
        ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
        op.create_iadd_node(ret)
        return ret

+    def get_context(self, node):
+        return node.outputs[0].type.context
+
    def create_iadd_node(self, node):
        # We store a iadd_node in the op that contain the info needed
        # for the inplace add.
@@ -210,7 +216,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        iadd_node = gop(xview, y).owner
        self.iadd_node = iadd_node

-    def perform(self, node, inputs, out_):
+    def perform(self, node, inputs, out_, ctx):
        out, = out_
        x, y = inputs[:2]
        indices = list(reversed(inputs[2:]))
@@ -321,7 +327,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
                                  %(view_ndim)s,
                                  dims,
                                  xview_strides,
-                                  pygpu_default_context(),
+                                  %(x)s->context,
                                  1,
                                  (PyObject *)%(x)s,
                                  (PyObject *)&PyGpuArrayType);
@@ -355,10 +361,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """
        return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()

-    def c_support_code_apply(self, node, nodename):
+    def c_support_code_struct(self, node, nodename):
        gop = self.iadd_node.op
        sub_name = nodename + "_add_to_zview"
-        ret = gop.c_support_code_apply(self.iadd_node, sub_name)
+        ret = gop.c_support_code_struct(self.iadd_node, sub_name)
        ret += """
        PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
                                                    PyGpuArrayObject* src){
@@ -366,10 +372,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """ % locals()
        inputs = ["dst", "src"]
        outputs = ["ret"]
-        sub = {"fail": "return NULL;"}
+        sub = {"fail": "return NULL;", "context": "dst->context"}
        ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
        ret += """
-            return dst;
+            return ret;
+
        }
        """
        return ret
@@ -399,7 +406,8 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):

 class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
    def make_node(self, x, ilist):
-        x_ = as_gpuarray_variable(x)
+        ctx_name = infer_context_name(x, ilist)
+        x_ = as_gpuarray_variable(x, ctx_name)

        ilist__ = tensor.as_tensor_variable(ilist)
        if ilist__.type.dtype[:3] not in ('int', 'uin'):
@@ -407,7 +415,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
        if ilist__.type.dtype != 'int64':
            ilist__ = tensor.cast(ilist__, 'int64')

-        ilist_ = as_gpuarray_variable(ilist__)
+        ilist_ = as_gpuarray_variable(ilist__, ctx_name)

        if ilist_.type.dtype != 'int64':
            raise TypeError('index must be int64')
@@ -419,6 +427,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
        bcast = ilist_.broadcastable + x_.broadcastable[1:]
        return gof.Apply(self, [x_, ilist_],
                         [GpuArrayType(dtype=x.dtype,
+                                       context_name=ctx_name,
                                       broadcastable=bcast)()])

    def perform(self, node, inp, out_):
@@ -475,8 +484,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
    """

    def make_node(self, x, y, ilist):
-        x_ = as_gpuarray_variable(x)
-        y_ = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(x, y)
+        x_ = as_gpuarray_variable(x, ctx_name)
+        y_ = as_gpuarray_variable(y, ctx_name)
        ilist_ = tensor.as_tensor_variable(ilist)

        assert x_.type.dtype == y_.type.dtype
@@ -567,16 +577,16 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
    only avail on compute capability 2.0 and more recent.

    """
-
    _f16_ok = True

    def make_node(self, x, y, ilist):
        """It defer from GpuAdvancedIncSubtensor1 in that it make sure
        the index are of type long.
        """
-        x_ = as_gpuarray_variable(x)
-        y_ = as_gpuarray_variable(y)
-        ilist_ = as_gpuarray_variable(ilist)
+        ctx_name = infer_context_name(x, y, ilist)
+        x_ = as_gpuarray_variable(x, ctx_name)
+        y_ = as_gpuarray_variable(y, ctx_name)
+        ilist_ = as_gpuarray_variable(ilist, ctx_name)

        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim
@@ -599,32 +609,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):

        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

+    def get_context(self, node):
+        return node.outputs[0].type.context
+
+    def perform(self, node, inp, out, ctx):
+        return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
+
    def c_code_cache_version(self):
        return (6,)

    def c_headers(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        return ['cuda.h', '<numpy_compat.h>', '<gpuarray_helper.h>',
+        return ['<numpy_compat.h>', '<gpuarray_helper.h>',
                '<gpuarray/types.h>']

    def c_header_dirs(self):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
-        cuda_root = config.cuda.root
-        res = [os.path.dirname(__file__)]
-        if cuda_root:
-            res.append(os.path.join(cuda_root, 'include'))
-        return res
+        return [os.path.dirname(__file__)]

    def c_code(self, node, name, inputs, outputs, sub):
-        active_device_no = theano.sandbox.cuda.active_device_number()
-        device_properties = theano.sandbox.cuda.device_properties
-        compute_capability = device_properties(active_device_no)['major']
-        if ((self.set_instead_of_inc) or
-                (node.inputs[0].ndim != node.inputs[1].ndim) or
-                (node.inputs[0].ndim != 2) or
-                (compute_capability < 2)):
+        ctx = self.get_context(node)
+        if ctx.kind != 'cuda':
+            raise NotImplementedError("cuda only")
+        if (self.set_instead_of_inc or
+                node.inputs[0].ndim != node.inputs[1].ndim or
+                node.inputs[0].ndim != 2 or
+                ctx.bin_id[-2] < '2'):
            raise NotImplementedError("This case does not have C code yet.")

        x = inputs[0]
@@ -754,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]

-    def c_support_code_apply(self, node, nodename):
+    def c_support_code_struct(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
@@ -765,7 +773,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
        itemsize_out = numpy.dtype(dtype_out).itemsize
        k_var = "k_vector_add_fast_" + nodename

-        return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
+        return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
        int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                     PyGpuArrayObject* py_other,
                                     PyGpuArrayObject *indices_arr)

--- a/theano/sandbox/gpuarray/tests/config.py
+++ b/theano/sandbox/gpuarray/tests/config.py
+from nose.plugins.skip import SkipTest
+
+import theano.sandbox.gpuarray
+
+if theano.sandbox.gpuarray.pygpu is None:
+    raise SkipTest("pygpu not installed")
+
+if not theano.sandbox.gpuarray.pygpu_activated:
+    import theano.sandbox.cuda as cuda_ndarray
+    if cuda_ndarray.cuda_available:
+        cuda_ndarray.use('gpu', default_to_move_computation_to_gpu=False,
+                         move_shared_float32_to_gpu=False,
+                         enable_cuda=False)
+        theano.sandbox.gpuarray.init_dev('cuda')
+
+if not theano.sandbox.gpuarray.pygpu_activated:
+    raise SkipTest("pygpu disabled")
+
+test_ctx_name = None
+
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -13,53 +13,22 @@ from theano.tensor.basic import alloc
 from theano.tensor.tests import test_basic
 from theano.tensor.tests.test_basic import rand, safe_make_node
 from theano.tests import unittest_tools as utt
-from theano.tests.unittest_tools import SkipTest

-import theano.sandbox.gpuarray
-
-from ..type import (GpuArrayType,
+from ..type import (GpuArrayType, get_context,
                    gpuarray_shared_constructor)
 from ..basic_ops import (
-    host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
-    gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
+    host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape,
+    GpuAlloc, GpuAllocEmpty, GpuContiguous,
    gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
 from ..subtensor import GpuSubtensor

-import theano.sandbox.cuda as cuda_ndarray
-
-try:
-    from pygpu import gpuarray
-except:
-    pass
-
-if theano.sandbox.gpuarray.pygpu is None:
-    raise SkipTest("pygpu not installed")
+from .config import mode_with_gpu, mode_without_gpu, test_ctx_name

-# If you are writing a new test file, don't copy this code, but rather
-# import stuff from this file (like mode_with_gpu) to reuse it.
-if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
-    if not cuda_ndarray.use.device_number:
-        # We should not enable all the use like the flag device=gpu,
-        # as many tests don't work in that setup.
-        cuda_ndarray.use('gpu',
-                         default_to_move_computation_to_gpu=False,
-                         move_shared_float32_to_gpu=False,
-                         enable_cuda=False)
-    theano.sandbox.gpuarray.init_dev('cuda')
-
-if not theano.sandbox.gpuarray.pygpu_activated:
-    raise SkipTest("pygpu disabled")
+from pygpu import gpuarray

 utt.seed_rng()
 rng = numpy.random.RandomState(seed=utt.fetch_seed())

-if theano.config.mode == 'FAST_COMPILE':
-    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
-    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
-else:
-    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
-    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
-

 def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
                 on_unused_input='raise', name=None):
@@ -88,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs):
    cls = kwargs.pop('cls', None)
    if len(kwargs) != 0:
        raise TypeError('Unexpected argument %s', list(kwargs.keys())[0])
-    return gpuarray.array(r, dtype=dtype, cls=cls)
+    return gpuarray.array(r, dtype=dtype, cls=cls,
+                          context=get_context(test_ctx_name))


 def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
@@ -114,6 +84,7 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,

        def test_all(self):
            if skip:
+                from nose.plugins.skip import SkipTest
                raise SkipTest(skip)

            for testname, inputs in iteritems(cases):
@@ -199,9 +170,9 @@ def test_transfer_cpu_gpu():
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    av = numpy.asarray(rng.rand(5, 4), dtype='float32')
-    gv = gpuarray.array(av)
+    gv = gpuarray.array(av, context=get_context(test_ctx_name))

-    f = theano.function([a], gpu_from_host(a))
+    f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

@@ -218,12 +189,12 @@ def test_transfer_strided():
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    av = numpy.asarray(rng.rand(5, 8), dtype='float32')
-    gv = gpuarray.array(av)
+    gv = gpuarray.array(av, context=get_context(test_ctx_name))

    av = av[:, ::2]
    gv = gv[:, ::2]

-    f = theano.function([a], gpu_from_host(a))
+    f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

@@ -233,14 +204,14 @@ def test_transfer_strided():


 def gpu_alloc_expected(x, *shp):
-    g = gpuarray.empty(shp, dtype=x.dtype)
+    g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    g[:] = x
    return g

 GpuAllocTester = makeTester(
    name="GpuAllocTester",
    op=alloc,
-    gpu_op=gpu_alloc,
+    gpu_op=GpuAlloc(test_ctx_name),
    cases=dict(
        correct01=(rand(), numpy.int32(7)),
        # just gives a DeepCopyOp with possibly wrong results on the CPU
@@ -260,19 +231,19 @@ class TestAlloc(test_basic.TestAlloc):
    dtype = "float32"
    mode = mode_with_gpu
    shared = staticmethod(gpuarray_shared_constructor)
-    allocs = [GpuAlloc(), GpuAlloc(), T.Alloc()]
+    allocs = [GpuAlloc(test_ctx_name), GpuAlloc(test_ctx_name), T.Alloc()]


 def test_alloc_empty():
    for dt in ['float32', 'int8']:
-        f = theano.function([], GpuAllocEmpty(dt)(2, 3))
+        f = theano.function([], GpuAllocEmpty(dt, context_name=test_ctx_name)(2, 3))
        assert len(f.maker.fgraph.apply_nodes) == 1
        out = f()
        assert out.shape == (2, 3)
        assert out.dtype == dt

-    f = theano.function([], [GpuAllocEmpty('uint64')(3, 2),
-                             GpuAllocEmpty('uint64')(3, 2)])
+    f = theano.function([], [GpuAllocEmpty('uint64', test_ctx_name)(3, 2),
+                             GpuAllocEmpty('uint64', test_ctx_name)(3, 2)])
    out = f()
    assert out[0].shape == (3, 2)
    assert out[0].dtype == 'uint64'
@@ -284,7 +255,7 @@ def test_alloc_empty():

 def test_shape():
    x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
-    v = gpuarray.zeros((3, 4, 5), dtype='float32')
+    v = gpuarray.zeros((3, 4, 5), dtype='float32', context=get_context(test_ctx_name))
    f = theano.function([x], x.shape)
    topo = f.maker.fgraph.toposort()
    assert numpy.all(f(v) == (3, 4, 5))
@@ -436,12 +407,13 @@ def test_hostfromgpu_shape_i():
    ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    cv = gpuarray.asarray(numpy.random.rand(5, 4),
-                          dtype='float32')
+                          dtype='float32',
+                          context=get_context(test_ctx_name))

-    f = theano.function([a], gpu_from_host(a), mode=m)
-    assert gpu_from_host in [x.op
-                             for x in f.maker.fgraph.toposort()]
-    f = theano.function([a], gpu_from_host(a).shape, mode=m)
+    f = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
+    assert any(isinstance(x.op, GpuFromHost)
+               for x in f.maker.fgraph.toposort())
+    f = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)

--- a/theano/sandbox/gpuarray/tests/test_blas.py
+++ b/theano/sandbox/gpuarray/tests/test_blas.py
@@ -10,8 +10,8 @@ from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
 from theano.tensor.tests.test_blas import TestGer, BaseGemv

 from .. import gpuarray_shared_constructor
-from .test_basic_ops import (makeTester, rand,
-                             mode_with_gpu)
+from .config import mode_with_gpu
+from .test_basic_ops import makeTester, rand

 from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
                    gpugemm_inplace,
@@ -100,7 +100,7 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
        self.ops = [gpuger_no_inplace, gpuger_inplace]

    def clone(self, op):
-        return GpuGer(destructive=op.destructive)
+        return GpuGer(inplace=op.inplace)


 GpuDot22Tester = makeTester(

--- a/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
@@ -14,8 +14,8 @@ from theano import tensor
 from theano.tests.unittest_tools import seed_rng

 # We let that import do the init of the back-end if needed.
-from .test_basic_ops import mode_with_gpu
-from ..type import GpuArrayType
+from .config import mode_with_gpu, test_ctx_name
+from ..type import GpuArrayType, get_context
 from ..conv import GpuConv
 from theano.sandbox.gpuarray import dnn

@@ -28,7 +28,7 @@ try:
 except ImportError:
    pass

-gftensor4 = GpuArrayType('float32', [False] * 4)
+gftensor4 = GpuArrayType('float32', [False] * 4, context_name=test_ctx_name)


 def py_conv_valid_numpy(img, kern):
@@ -135,8 +135,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
-    img = pygpu.array(npy_img)
-    kern = pygpu.array(npy_kern)
+    img = pygpu.array(npy_img, context=get_context(test_ctx_name))
+    kern = pygpu.array(npy_kern, context=get_context(test_ctx_name))

    # we take the stride after the transfert as we make c_contiguous
    # data on the GPU.

--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
@@ -15,12 +15,12 @@ from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad
 from .. import dnn
 from ..basic_ops import GpuAllocEmpty

-from .test_basic_ops import mode_with_gpu, mode_without_gpu
+from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
 from . import test_nnet


 def test_dnn_conv_desc_merge():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    kern_shp = T.as_tensor_variable(
        numpy.asarray([3, 1, 2, 2]).astype('int64'))
@@ -41,7 +41,7 @@ def test_dnn_conv_desc_merge():

 def test_dnn_conv_merge():
    # This test that we merge correctly multiple dnn_conv.
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
@@ -80,7 +80,7 @@ def test_dnn_conv_inplace():
    GpuAllocEmpty get merged together.

    """
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
@@ -105,7 +105,7 @@ def test_dnn_conv_inplace():
    assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2

    # Test grad w op
-    out = GpuAllocEmpty(kern.dtype)(*kern.shape)
+    out = GpuAllocEmpty(kern.dtype, test_ctx_name)(*kern.shape)
    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
@@ -116,7 +116,7 @@ def test_dnn_conv_inplace():
    assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2

    # Test grad i op
-    out = GpuAllocEmpty(img.dtype)(*img.shape)
+    out = GpuAllocEmpty(img.dtype, test_ctx_name)(*img.shape)
    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
@@ -163,7 +163,7 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,


 def test_pooling():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    x = T.ftensor4()
@@ -269,7 +269,7 @@ def test_pooling():


 def test_pooling_opt():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    x = T.fmatrix()
@@ -318,7 +318,7 @@ def test_dnn_tag():
            max_pool_2d(x, ds=(2, 2), ignore_border=True),
            mode=mode_with_gpu.including("cudnn"))
    except (AssertionError, RuntimeError):
-        assert not dnn.dnn_available()
+        assert not dnn.dnn_available(test_ctx_name)
        raised = True
    finally:
        theano.config.on_opt_error = old
@@ -327,7 +327,7 @@ def test_dnn_tag():
        logging.getLogger('theano').addHandler(theano.logging_default_handler)

    if not raised:
-        assert dnn.dnn_available()
+        assert dnn.dnn_available(test_ctx_name)
        assert any([isinstance(n.op, dnn.GpuDnnPool)
                    for n in f.maker.fgraph.toposort()])

@@ -338,7 +338,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
        self.mode = mode_with_gpu

    def test_softmax(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        t = T.ftensor4('t')
        rand_tensor = numpy.asarray(
@@ -368,7 +368,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
        )

    def test_conv(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        kerns = T.ftensor4('kerns')
@@ -406,7 +406,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
            )

    def test_conv_gradw(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        kerns = T.ftensor4('kerns')
@@ -455,7 +455,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
            )

    def test_conv_gradi(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        kerns = T.ftensor4('kerns')
@@ -499,7 +499,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
            )

    def test_pool(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        img_val = numpy.asarray(
@@ -524,7 +524,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
            )

    def test_pool_grad(self):
-        if not dnn.dnn_available():
+        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        img = T.ftensor4('img')
        img_grad = T.ftensor4('img_grad')
@@ -568,7 +568,7 @@ class TestDnnInferShapes(utt.InferShapeTester):

 # this has been a problem in the past
 def test_dnn_conv_border_mode():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
@@ -580,7 +580,7 @@ def test_dnn_conv_border_mode():


 def test_dnn_conv_alpha_output_merge():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
@@ -678,7 +678,7 @@ def test_dnn_conv_grad():


 def test_version():
-    if not dnn.dnn_available():
+    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    assert isinstance(dnn.version(), int)


--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
@@ -4,19 +4,19 @@ import theano
 from theano import scalar, gof
 from theano.tests.unittest_tools import SkipTest, assert_allclose

-from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
-                                               test_CAReduce, T_reduce_dtype)
+from theano.tensor.tests import test_elemwise

-from .test_basic_ops import mode_with_gpu, rand_gpuarray
+from .config import mode_with_gpu, test_ctx_name
+from .test_basic_ops import rand_gpuarray
 from ..elemwise import (GpuElemwise, GpuDimShuffle,
                        GpuCAReduceCuda, GpuCAReduceCPY)
-from ..type import GpuArrayType
+from ..type import GpuArrayType, get_context

 from pygpu import ndgpuarray as gpuarray


 # This is acutally a test for GpuElemwise
-class test_gpu_Broadcast(test_Broadcast):
+class test_gpu_Broadcast(test_elemwise.test_Broadcast):
    op = GpuElemwise
    type = GpuArrayType
    cop = GpuElemwise
@@ -25,8 +25,7 @@ class test_gpu_Broadcast(test_Broadcast):
    linkers = [gof.PerformLinker, gof.CLinker]

    def setUp(self):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            self.linkers = [gof.PerformLinker]

    def rand_val(self, shp):
@@ -36,14 +35,12 @@ class test_gpu_Broadcast(test_Broadcast):
        return rand_gpuarray(*shp, **dict(cls=gpuarray))

    def test_c(self):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            raise SkipTest("Cuda specific tests")
        super(test_gpu_Broadcast, self).test_c()

    def test_c_inplace(self):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            raise SkipTest("Cuda specific tests")
        super(test_gpu_Broadcast, self).test_c_inplace()

@@ -51,8 +48,7 @@ class test_gpu_Broadcast(test_Broadcast):
 def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of integer
    # or float input dtype.
-    dev = theano.sandbox.gpuarray.init_dev.device
-    if not dev.startswith('cuda'):
+    if get_context(test_ctx_name).kind != 'cuda':
        raise SkipTest("Cuda specific tests")

    dtypes = ["uint8", "uint16", "uint32", "uint64",
@@ -77,11 +73,11 @@ def test_elemwise_pow():
            assert_allclose(out, expected_out)


-class test_GpuDimShuffle(test_DimShuffle):
+class test_GpuDimShuffle(test_elemwise.test_DimShuffle):
    op = GpuDimShuffle


-class test_GpuCAReduceCPY(test_CAReduce):
+class test_GpuCAReduceCPY(test_elemwise.test_CAReduce):
    dtypes = ["float32"]
    bin_dtypes = ["uint8", "int8"]
    op = GpuCAReduceCPY
@@ -120,7 +116,7 @@ class test_GpuCAReduceCPY(test_CAReduce):

    def test_infer_shape(self):
        for dtype in self.dtypes:
-            test_CAReduce.test_infer_shape(self, dtype)
+            super(test_GpuCAReduceCPY, self).test_infer_shape(dtype)


 class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
@@ -133,15 +129,15 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((5, 6), (1, )),
             ((5, 6), (-1, )),
             ((5, 6), (-2, )),
-             #((5, 6), ()),  #reduce on no axis(copy) isn't implemented
-             #((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
-             #((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
+             # ((5, 6), ()),  #reduce on no axis(copy) isn't implemented
+             # ((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
+             # ((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
             ((5, 0), None),
             ((5, 0), (0, )),
             ((5, 0), (1, )),
-             #((5, 0), ()), reduce on no axis isn't implemented
-             #((), None), reduce on no axis isn't implemented
-             #((), ()) reduce on no axis isn't implemented
+             # ((5, 0), ()), reduce on no axis isn't implemented
+             # ((), None), reduce on no axis isn't implemented
+             # ((), ()) reduce on no axis isn't implemented

             # Test all GPU cases implemented
             ((1, 0), (1,)),
@@ -176,7 +172,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]),  # 001
             ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]),  # 110
             ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]),  # 011
-             #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
+             # ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
             ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]),  # 111
             ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]),  # 111

@@ -189,17 +185,17 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):

             # test pattern implemented by reshape
             # Skip them as this test the op directly, not the optimization with reshape
-#             ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
-#             ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
-#             ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
-#             ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
-#             ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
-#             ((5,4,3,10,11),[1,2]),
+             # ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
+             # ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
+             # ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
+             # ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
+             # ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
+             # ((5,4,3,10,11),[1,2]),
             ]
    op = GpuCAReduceCuda
    reds = [scalar.add, scalar.mul,
            scalar.maximum, scalar.minimum]
-    pre_scalar_op = scalar.sqr
+    pre_scalar_op = None

    def test_perform(self):
        return
@@ -209,12 +205,11 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):

    def setUp(self):
        super(test_GpuCAReduceCuda, self).setUp()
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            raise SkipTest("Cuda specific tests")


-class T_gpureduce_dtype(T_reduce_dtype):
+class T_gpureduce_dtype(test_elemwise.T_reduce_dtype):
    mode = mode_with_gpu.excluding('local_cut_useless_reduce')
    op = GpuCAReduceCuda
    # Currently we don't support reduction on 0 axis
@@ -225,8 +220,7 @@ class T_gpureduce_dtype(T_reduce_dtype):
              'float32', 'float64']

    def setUp(self):
-        dev = theano.sandbox.gpuarray.init_dev.device
-        if not dev.startswith('cuda'):
+        if get_context(test_ctx_name).kind != 'cuda':
            raise SkipTest("Cuda specific tests")



--- a/theano/sandbox/gpuarray/tests/test_neighbours.py
+++ b/theano/sandbox/gpuarray/tests/test_neighbours.py
-
 from theano.tensor.nnet.tests import test_neighbours
-# We let that import do the init of the back-end if needed.
-from .test_basic_ops import mode_with_gpu
+
+from .config import mode_with_gpu

 from ..neighbours import GpuImages2Neibs


--- a/theano/sandbox/gpuarray/tests/test_nerv.py
+++ b/theano/sandbox/gpuarray/tests/test_nerv.py
@@ -6,7 +6,7 @@ from theano import function
 from theano.tests import unittest_tools as utt
 from theano.tensor import vector, matrix, dot

-from .test_basic_ops import mode_with_gpu
+from .config import mode_with_gpu
 from ..nerv import Gemm16, nerv



--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
@@ -7,9 +7,7 @@ import theano
 import theano.tensor as T
 import theano.tests.unittest_tools as utt

-# We let that import do the init of the back-end if needed.
-from .test_basic_ops import (mode_with_gpu,
-                             mode_without_gpu)
+from .config import mode_with_gpu, mode_without_gpu
 from ..nnet import (
    GpuCrossentropySoftmaxArgmax1HotWithBias,
    GpuCrossentropySoftmax1HotWithBiasDx,

--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -4,17 +4,16 @@ import theano
 from theano import tensor
 from theano.tests.breakpoint import PdbBreakpoint
 from theano.tests import unittest_tools as utt
-from theano.tests.unittest_tools import SkipTest
 from theano.tensor.tests import test_basic

 import theano.sandbox.gpuarray
 from .. import basic_ops
-from ..type import GpuArrayType, gpuarray_shared_constructor
-from ..basic_ops import (GpuAlloc, GpuReshape, gpu_alloc,
-                         gpu_from_host, host_from_gpu)
+from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
+from ..basic_ops import GpuAlloc, GpuReshape, GpuFromHost, host_from_gpu
 from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
 from ..subtensor import GpuSubtensor
-from .test_basic_ops import rand_gpuarray, mode_with_gpu, mode_without_gpu
+
+from .config import mode_with_gpu, test_ctx_name


 def test_local_assert():
@@ -97,7 +96,7 @@ def test_flatten():


 def test_reduce():
-    dev = theano.sandbox.gpuarray.init_dev.device
+    kind = get_context(test_ctx_name).kind

    for method, param in [('sum', dict(acc_dtype='float32')),
                          ('prod', dict(acc_dtype='float32')),
@@ -113,7 +112,7 @@ def test_reduce():
        topo = f.maker.fgraph.toposort()
        ops = [type(node.op) for node in topo]

-        if dev.startswith('opencl') and method in ["max", "min"]:
+        if kind == 'opencl' and method in ["max", "min"]:
            assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
        else:
            assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
@@ -126,7 +125,7 @@ def test_local_gpualloc_memset_0():
    ones = numpy.ones((2,), dtype='float32')

    # Test with 0
-    a = gpu_alloc(z, i)
+    a = GpuAlloc(test_ctx_name)(z, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
@@ -134,7 +133,7 @@ def test_local_gpualloc_memset_0():
    assert (numpy.asarray(f(6)) == 0).all()

    # Test with 1
-    a = gpu_alloc(o, i)
+    a = GpuAlloc(test_ctx_name)(o, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
@@ -143,7 +142,7 @@ def test_local_gpualloc_memset_0():
    assert (numpy.asarray(f(6)) == 1).all()

    # Test with 1, 1
-    a = gpu_alloc(ones, i)
+    a = GpuAlloc(test_ctx_name)(ones, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
@@ -180,7 +179,7 @@ def test_print_op():
    f = theano.function([b], theano.printing.Print()(b) * 2,
                        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
-    assert topo[0].op == gpu_from_host
+    assert isinstance(topo[0].op, GpuFromHost)
    assert isinstance(topo[1].op, theano.printing.Print)
    assert isinstance(topo[2].op, GpuElemwise)
    assert topo[3].op == host_from_gpu
@@ -208,7 +207,7 @@ def test_pdbbreakpoint_op():

 def test_local_gpu_elemwise_careduce():
    x = theano.tensor.matrix()
-    o = (x*x).sum()
+    o = (x * x).sum()
    f = theano.function([x], o, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
@@ -234,7 +233,7 @@ def test_local_gpu_subtensor():
    # Test multiple use of the input
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = tensor.fmatrix()
-    f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
+    f = theano.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any([type(node.op) is tensor.Subtensor for node in topo])
    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
@@ -242,7 +241,7 @@ def test_local_gpu_subtensor():
    # Test multiple use of the input + input as output
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = tensor.fmatrix()
-    f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
+    f = theano.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any([type(node.op) is tensor.Subtensor for node in topo])
    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
@@ -250,7 +249,7 @@ def test_local_gpu_subtensor():
    # Test shared forced on CPU end we do computation on the output of
    # the subtensor.
    t = tensor._shared(numpy.zeros(20, "float32"))
-    f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
+    f = theano.function([], t[3:4] + 1, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any([type(node.op) is tensor.Subtensor for node in topo])
    assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
@@ -319,7 +318,7 @@ def test_local_gpu_elemwise():
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
-    c = cuda.shared_constructor(numpy.asarray(c_v, dtype='float32'))
+    c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype='float32'))
    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)

--- a/theano/sandbox/gpuarray/tests/test_scan.py
+++ b/theano/sandbox/gpuarray/tests/test_scan.py
@@ -6,10 +6,10 @@ import theano

 from theano.tests import unittest_tools as utt
 import theano.sandbox.rng_mrg
-from ..basic_ops import gpu_from_host, GpuFromHost, HostFromGpu
+from ..basic_ops import GpuFromHost, HostFromGpu
 from ..elemwise import GpuElemwise

-from .test_basic_ops import mode_with_gpu
+from .config import mode_with_gpu, test_ctx_name


 class T_Scan(TestCase):
@@ -35,7 +35,7 @@ class T_Scan(TestCase):
                                      go_backwards=False,
                                      mode=mode)

-        output = gpu_from_host(output)
+        output = GpuFromHost(test_ctx_name)(output)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
@@ -238,4 +238,4 @@ class T_Scan(TestCase):

        # I leave this to tested by debugmode, this test was anyway
        # more of does the graph compile kind of test
-        t_result = my_f()
+        my_f()
--- a/theano/sandbox/gpuarray/tests/test_subtensor.py
+++ b/theano/sandbox/gpuarray/tests/test_subtensor.py
@@ -11,8 +11,7 @@ from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
                         GpuAdvancedIncSubtensor1)
 from ..type import gpuarray_shared_constructor

-from .test_basic_ops import mode_with_gpu
-
+from .config import mode_with_gpu


 class G_subtensor(test_subtensor.T_subtensor):

--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
@@ -14,14 +14,80 @@ try:
 except ImportError:
    pass

+_context_reg = {}
+
+
+def reg_context(name, ctx):
+    """
+    Register a context by mapping it to a name.
+
+    The context must be of type `GpuContext` and the name can be
+    anything hashable (but is usually a string). Only one context can
+    be registered per name and the second registration for a given
+    name will raise an error.
+
+    Parameters
+    ----------
+    name : hashable object
+        Name to associate the context with (usually a string)
+    ctx : GpuContext
+        Context instance
+
+    """
+    if name in _context_reg:
+        raise ValueError("context name %s is already defined" % (name,))
+    if not isinstance(ctx, gpuarray.GpuContext):
+        raise TypeError("context is not GpuContext")
+    _context_reg[name] = ctx
+
+
+def get_context(name):
+    """
+    Retrive the context associated with a name.
+
+    Return the context object mapped to `ref` that was previously
+    register through :func:`reg_context`. Trying to get the context
+    for an unregistered `ref` will raise a exception.
+
+    Parameters
+    ----------
+    name : hashable object
+        Name associated with the context we want (usually a string)
+
+    """
+    if name not in _context_reg:
+        raise ValueError("context name %s not defined" % (name,))
+    return _context_reg[name]
+
+
+def list_contexts():
+    """
+    Return an iterable of all the registered context names.
+    """
+    return _context_reg.keys()
+
+
+# Private method
+def _name_for_ctx(ctx):
+    for k, v in _context_reg:
+        if v == ctx:
+            return k
+        raise ValueError('context is not registered')
+
+
+# This is a private method for use by the tests only
+def _unreg_context(name):
+    del _context_reg[name]
+

 class GpuArrayType(Type):
-    def __init__(self, dtype, broadcastable, name=None):
+    def __init__(self, dtype, broadcastable, context_name=None, name=None):
        # In case this was not provided and no global value is available
        self.dtype = str(dtype)
        self.broadcastable = tuple(bool(b) for b in broadcastable)
        self.ndim = len(self.broadcastable)
        self.name = name
+        self.context_name = context_name
        try:
            self.typecode = gpuarray.dtype_to_typecode(self.dtype)
        except gpuarray.GpuArrayException:
@@ -34,10 +100,16 @@ class GpuArrayType(Type):
        if broadcastable is None:
            broadcastable = self.broadcastable
        return self.__class__(dtype=dtype, broadcastable=broadcastable,
-                              name=self.name)
+                              context_name=self.context_name, name=self.name)
+
+    # This is a property to keep the type pickleable
+    @property
+    def context(self):
+        return get_context(self.context_name)

    def __repr__(self):
-        return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable)
+        return "GpuArrayType<%s>(%s, %s)" % (self.context_name, self.dtype,
+                                             self.broadcastable)

    def filter(self, data, strict=False, allow_downcast=None):
        if (isinstance(data, gpuarray.GpuArray) and
@@ -54,25 +126,28 @@ class GpuArrayType(Type):
                                "got %d (dtype %s)." %
                                (self, self.typecode, self.dtype,
                                 data.typecode, str(data.dtype)))
+            if self.context != data.context:
+                raise TypeError("data context does not match type context")
            # fallthrough to ndim check
            elif (allow_downcast or
                  (allow_downcast is None and
                   type(data) == float and
                   self.dtype == config.floatX)):
                data = gpuarray.array(data, dtype=self.typecode, copy=False,
-                                  ndmin=len(self.broadcastable))
+                                      ndmin=len(self.broadcastable),
+                                      context=self.context)
        else:
            if not hasattr(data, 'dtype'):
                # This is to convert objects that don't have a dtype
                # (like lists).  We anticipate that the type below
                # will match and we pass copy=False so it won't make a
                # second object on the GPU.
-                data = gpuarray.array(data, copy=False)
+                data = gpuarray.array(data, copy=False, context=self.context)

            up_dtype = scalar.upcast(self.dtype, data.dtype)
            if up_dtype == self.dtype:
-                data = gpuarray.array(data, dtype=self.dtype,
-                                      copy=False)
+                data = gpuarray.array(data, dtype=self.dtype, copy=False,
+                                      context=self.context)
            else:
                raise TypeError("%s cannot store a value of dtype %s "
                                "without risking loss of precision." %
@@ -90,8 +165,10 @@ class GpuArrayType(Type):
        return data

    def filter_variable(self, other, allow_convert=True):
+        from theano.sandbox.gpuarray import GpuFromHost
+
        if hasattr(other, '_as_GpuArrayVariable'):
-            other = other._as_GpuArrayVariable()
+            other = other._as_GpuArrayVariable(self.context_name)

        if not isinstance(other, Variable):
            other = self.Constant(type=self, data=other)
@@ -120,7 +197,7 @@ class GpuArrayType(Type):
                                 str(self.broadcastable)))
            other = other2

-        return theano.sandbox.gpuarray.basic_ops.gpu_from_host(other)
+        return GpuFromHost(self.context_name)(other)

    @staticmethod
    def values_eq(a, b):
@@ -189,7 +266,8 @@ class GpuArrayType(Type):
        return pygpu.gpuarray.may_share_memory(a, b)

    def value_zeros(self, shape):
-        return pygpu.gpuarray.zeros(shape, dtype=self.typecode)
+        return pygpu.gpuarray.zeros(shape, dtype=self.typecode,
+                                    context=self.context)

    def make_variable(self, name=None):
        return self.Variable(self, name=name)
@@ -197,19 +275,22 @@ class GpuArrayType(Type):
    def __eq__(self, other):
        return (type(self) == type(other) and
                self.typecode == other.typecode and
-                self.broadcastable == other.broadcastable)
+                self.broadcastable == other.broadcastable and
+                self.context_name == other.context_name)

    def convert_variable(self, var):
        vt = var.type
        if (type(self) == type(vt) and
                self.typecode == vt.typecode and
                self.ndim == vt.ndim and
+                self.context_name == vt.context_name and
                all(sb == ob or ob for sb, ob in zip(self.broadcastable,
                                                     vt.broadcastable))):
            return theano.tensor.patternbroadcast(var, self.broadcastable)

    def __hash__(self):
-        return (hash(self.typecode) ^ hash(self.broadcastable))
+        return hash((type(self), self.typecode, self.broadcastable,
+                     self.context_name))

    def dtype_specs(self):
        """
@@ -324,8 +405,12 @@ class _operators(_tensor_py_operators):
        from .basic_ops import host_from_gpu
        return host_from_gpu(self)

-    def _as_GpuArrayVariable(self):
+    def _as_GpuArrayVariable(self, context_name):
+        if self.type.context_name == context_name:
            return self
+        else:
+            from .basic_ops import GpuToGpu
+            return GpuToGpu(context_name)(self)


 class GpuArrayVariable(_operators, Variable):
@@ -370,7 +455,8 @@ class GpuArraySharedVariable(_operators, SharedVariable):

    def set_value(self, value, borrow=False):
        if isinstance(value, pygpu.gpuarray.GpuArray):
-            value = pygpu.gpuarray.array(value, copy=(not borrow))
+            value = pygpu.gpuarray.array(value, copy=(not borrow),
+                                         context=self.type.context)
        self.container.value = value

    def __getitem__(self, *args):
@@ -382,7 +468,8 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable

 def gpuarray_shared_constructor(value, name=None, strict=False,
                                allow_downcast=None, borrow=False,
-                                broadcastable=None):
+                                broadcastable=None,
+                                context_name=None):
    """
    SharedVariable constructor for GpuArrayType.

@@ -390,10 +477,20 @@ def gpuarray_shared_constructor(value, name=None, strict=False,
    if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
        raise TypeError('ndarray or GpuArray required')

+    try:
+        get_context(context_name)
+    except ValueError:
+        # Don't make this a hard error if we attempt to make a shared
+        # variable while there is no default context.
+        if context_name is None:
+            raise TypeError('No default context and no context specified')
+        raise
+
    if broadcastable is None:
        broadcastable = (False,) * value.ndim
-    type = GpuArrayType(value.dtype, broadcastable)
-    deviceval = pygpu.gpuarray.array(value, copy=(not borrow))
+    type = GpuArrayType(value.dtype, broadcastable, context_name=context_name)
+    deviceval = pygpu.gpuarray.array(value, copy=(not borrow),
+                                     context=type.context)
    return GpuArraySharedVariable(type=type, value=deviceval, name=name,
                                  strict=strict)

@@ -485,3 +582,63 @@ theano.compile.register_specify_shape_c_code(
    """,
    version=1,
    c_support_code_apply='#include <numpy_compat.h>')
+
+
+class GpuContextType(Type):
+    def filter(self, data, strict=False, allow_downcast=None):
+        if not isinstance(data, gpuarray.GpuContext):
+            raise TypeError('context is not a GpuContext')
+        return data
+
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+    @staticmethod
+    def values_eq(a, b):
+        return a == b
+
+    def c_declare(self, name, sub, check_input=True):
+        return "PyGpuContextObject *%s;" % (name,)
+
+    def c_init(self, name, sub):
+        return "%s = NULL;" % (name,)
+
+    def c_extract(self, name, sub, check_input=True):
+        if check_input:
+            res = """
+if (!PyObject_TypeCheck(py_%(name)s, &PyGpuContextType)) {
+  PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
+  %(fail)s
+}
+""" % dict(name=name, fail=sub['fail'])
+        else:
+            res = ""
+        return res + """
+%(name)s = (PyGpuContextObject *)py_%(name)s;
+Py_INCREF(%(name)s);
+""" % dict(name=name)
+
+    def c_cleanup(self, name, sub):
+        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % dict(name=name)
+
+    # c_sync is intentionally not declared to prevent normal usage
+
+    def c_init_code(self):
+        return ['import_pygpu__gpuarray();']
+
+    def c_headers(self):
+        return ['<gpuarray_api.h>']
+
+    def c_header_dirs(self):
+        return [pygpu.get_include()]
+
+    def c_code_cache_version(self):
+        ver = pygpu.gpuarray.api_version()
+        return (0, ver[0])
+
+    # Variable, Contstant, ... not declared
+
+gpu_context_type = GpuContextType()
--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -771,6 +771,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
    # GpuArray version
    _f16_ok = True

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        v_size = as_tensor_variable(size)

--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
@@ -1014,9 +1014,9 @@ class ScanInplaceOptimizer(Optimizer):

    """

-    def __init__(self, typeConstructor=None, gpu_flag=False, gpua_flag=False):
+    def __init__(self, typeInfer=None, gpu_flag=False, gpua_flag=False):
        Optimizer.__init__(self)
-        self.typeConstructor = typeConstructor
+        self.typeInfer = typeInfer
        self.gpu_flag = gpu_flag
        self.gpua_flag = gpua_flag

@@ -1062,10 +1062,15 @@ class ScanInplaceOptimizer(Optimizer):
                ls[idx] = deep_copy_op(ls[idx])

        inputs = ls_begin + ls + ls_end
+        if self.typeInfer is None:
+            typeConstructor = None
+        else:
+            typeConstructor = self.typeInfer(node)
+
        new_op = scan_op.Scan(op.inputs,
                              op.outputs,
                              info,
-                              typeConstructor=self.typeConstructor)
+                              typeConstructor=typeConstructor)

        # Do not call make_node for test_value
        new_outs = new_op(*inputs, **dict(return_list=True))
@@ -2325,7 +2330,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB()
 optdb.register('scan_eqopt1', scan_eqopt1, .1, 'fast_run', 'scan')
 optdb.register('scan_eqopt2', scan_eqopt2, 1.6, 'fast_run', 'scan')
 optdb.register('scanOp_make_inplace',
-               ScanInplaceOptimizer(typeConstructor=None,
+               ScanInplaceOptimizer(typeInfer=None,
                                    gpu_flag=False),
               75,
               'fast_run',

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -4874,6 +4874,12 @@ class T_Scan_Gpuarray(unittest.TestCase, ScanGpuTests):
    def __init__(self, *args, **kwargs):
        from theano.sandbox import gpuarray
        self.gpu_backend = gpuarray
+
+        # This is unfortunate, but required
+        def gpu_from_host(v):
+            return gpuarray.GpuFromHost(None)(v)
+        self.gpu_backend.gpu_from_host = gpu_from_host
+
        self.mode_with_gpu = mode_with_opt.including('gpuarray', 'scan')
        self.mode_with_gpu_nodebug = mode_nodebug.including('gpuarray', 'scan')
        super(T_Scan_Gpuarray, self).__init__(*args, **kwargs)

--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
@@ -158,10 +158,6 @@ whitelist_flake8 = [
    "sandbox/linalg/__init__.py",
    "sandbox/linalg/tests/test_linalg.py",
    "sandbox/gpuarray/__init__.py",
-    "sandbox/gpuarray/tests/test_subtensor.py",
-    "sandbox/gpuarray/tests/test_scan.py",
-    "sandbox/gpuarray/tests/test_opt.py",
-    "sandbox/gpuarray/tests/test_elemwise.py",
    "scan_module/scan_utils.py",
    "scan_module/scan_views.py",
    "scan_module/scan.py",