testgroup / pytensor / Commits / 0c12e4e5

Commit 0c12e4e5 authored Oct 16, 2015 by Julien Rebetez

    Merge branch 'master' into optdb_contains

Parents: 4d696af3, 2928d02a

Showing 69 changed files with 599 additions and 466 deletions (+599 / -466)
doc/faq.txt                                            +5   -0
doc/install.txt                                        +9   -8
doc/introduction.txt                                   +1   -0
doc/scripts/docgen.py                                  +1   -1
theano/__init__.py                                     +2   -1
theano/compile/debugmode.py                            +26  -30
theano/compile/mode.py                                 +3   -1
theano/compile/tests/test_mode.py                      +5   -0
theano/configdefaults.py                               +23  -0
theano/d3viz/tests/test_formatting.py                  +1   -1
theano/gof/cc.py                                       +41  -6
theano/gof/cmodule.py                                  +4   -0
theano/misc/check_multi_gpu.py                         +67  -0
theano/printing.py                                     +34  -8
theano/sandbox/cuda/basic_ops.py                       +2   -8
theano/sandbox/cuda/cuda_ndarray.cu                    +17  -17
theano/sandbox/cuda/dnn.py                             +0   -4
theano/sandbox/cuda/opt.py                             +4   -1
theano/sandbox/cuda/tests/test_dnn.py                  +7   -6
theano/sandbox/gpuarray/__init__.py                    +25  -10
theano/sandbox/gpuarray/basic_ops.py                   +0   -0
theano/sandbox/gpuarray/blas.py                        +88  -75
theano/sandbox/gpuarray/conv.cu                        +0   -0
theano/sandbox/gpuarray/conv.py                        +28  -77
theano/sandbox/gpuarray/conv_kernel.cu                 +2   -2
theano/sandbox/gpuarray/dnn.py                         +0   -0
theano/sandbox/gpuarray/dnn_base.c                     +3   -3
theano/sandbox/gpuarray/dnn_fwd.c                      +2   -2
theano/sandbox/gpuarray/dnn_gi.c                       +2   -2
theano/sandbox/gpuarray/dnn_gw.c                       +2   -2
theano/sandbox/gpuarray/dnn_pool.c                     +2   -2
theano/sandbox/gpuarray/dnn_pool_grad.c                +3   -3
theano/sandbox/gpuarray/dnn_softmax.c                  +2   -2
theano/sandbox/gpuarray/dnn_softmax_grad.c             +2   -2
theano/sandbox/gpuarray/elemwise.py                    +0   -0
theano/sandbox/gpuarray/gemm16.c                       +3   -2
theano/sandbox/gpuarray/neighbours.py                  +14  -7
theano/sandbox/gpuarray/nerv.py                        +13  -30
theano/sandbox/gpuarray/nnet.py                        +53  -35
theano/sandbox/gpuarray/opt.py                         +0   -0
theano/sandbox/gpuarray/opt_util.py                    +3   -2
theano/sandbox/gpuarray/pycuda_helper.py               +0   -22
theano/sandbox/gpuarray/subtensor.py                   +49  -41
theano/sandbox/gpuarray/tests/config.py                +26  -0
theano/sandbox/gpuarray/tests/test_basic_ops.py        +25  -53
theano/sandbox/gpuarray/tests/test_blas.py             +0   -0
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py  +0  -0
theano/sandbox/gpuarray/tests/test_dnn.py              +0   -0
theano/sandbox/gpuarray/tests/test_elemwise.py         +0   -0
theano/sandbox/gpuarray/tests/test_neighbours.py       +0   -0
theano/sandbox/gpuarray/tests/test_nerv.py             +0   -0
theano/sandbox/gpuarray/tests/test_nnet.py             +0   -0
theano/sandbox/gpuarray/tests/test_opt.py              +0   -0
theano/sandbox/gpuarray/tests/test_scan.py             +0   -0
theano/sandbox/gpuarray/tests/test_subtensor.py        +0   -0
theano/sandbox/gpuarray/type.py                        +0   -0
theano/sandbox/rng_mrg.py                              +0   -0
theano/scan_module/scan_op.py                          +0   -0
theano/scan_module/scan_opt.py                         +0   -0
theano/scan_module/scan_perform_ext.py                 +0   -0
theano/scan_module/scan_utils.py                       +0   -0
theano/scan_module/tests/test_scan.py                  +0   -0
theano/tensor/basic.py                                 +0   -0
theano/tensor/nnet/neighbours.py                       +0   -0
theano/tensor/opt.py                                   +0   -0
theano/tensor/tests/test_basic.py                      +0   -0
theano/tensor/tests/test_elemwise.py                   +0   -0
theano/tests/test_flake8.py                            +0   -0
theano/tests/test_printing.py                          +0   -0
doc/faq.txt
@@ -5,6 +5,11 @@
 ==========================
 Frequently Asked Questions
 ==========================

+Does Theano support Python 3?
+------------------------------
+
+We support both Python 2 >= 2.6 and Python 3 >= 3.3.
+
 TypeError: object of type 'TensorVariable' has no len()
 -------------------------------------------------------
doc/install.txt
@@ -20,11 +20,11 @@ instructions below for detailed installation steps):
 We develop mainly on 64-bit Linux machines. other architectures are
 not well-tested.

-Python_ >= 2.6
+Python_ 2 >= 2.6 or Python_ 3 >= 3.3
     The development package (``python-dev`` or ``python-devel``
     on most Linux distributions) is recommended (see just below).
     Python 2.4 was supported up to and including the release 0.6.
-    Python 3 is supported via 2to3 only, starting from 3.3.
+    Python 3 is supported past the 3.3 release.

 ``g++``, ``python-dev``
     Not technically required but *highly* recommended, in order to compile

@@ -147,7 +147,13 @@ by typing
     pip install Theano

+This should work under Python 2 or Python 3. To test, run
+
+.. code-block:: bash
+
+    nosetests theano
+
-You may need to add ``sudo`` before this command to install into your
+You may need to add ``sudo`` before the ``pip`` command to install into your
 system's ``site-packages`` directory. If you do not have administrator access
 to your machine, you can install Theano locally (to ~/.local) using

@@ -255,11 +261,6 @@ From here, the easiest way to get started is (this requires setuptools_ or distr
     cd Theano
     python setup.py develop

-.. note::
-
-    "python setup.py develop ..." does not work on Python 3 as it does not call
-    the converter from Python 2 code to Python 3 code.
-
 This will install a ``.pth`` file in your ``site-packages`` directory that
 tells Python where to look for your Theano installation (i.e. in the
 directory your just checked out of Github). Using ``develop`` mode is
doc/introduction.txt
@@ -182,6 +182,7 @@ Here is the state of that vision as of December 3th, 2013 (after Theano release
 * Possible implementation note: allow Theano Variable in the fgraph to
   have more than 1 owner.
+* We support Python 2 and Python 3.
 * We have a CUDA backend for tensors of type `float32` only.
 * Efforts have begun towards a generic GPU ndarray (GPU tensor) (started in the
   `libgpuarray <https://github.com/Theano/libgpuarray>`_ project)
doc/scripts/docgen.py
@@ -131,7 +131,7 @@ if __name__ == '__main__':
         inopt = [docpath, workdir]
         if files is not None:
             inopt.extend(files)
-        sphinx.main(['', '-b', builder] + extraopts + inopt)
+        sphinx.build_main(['', '-b', builder] + extraopts + inopt)
     if options['--all'] or options['--rst']:
         mkdir("doc")
theano/__init__.py
@@ -112,7 +112,8 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
 if (config.device.startswith('cuda') or
     config.device.startswith('opencl') or
     config.init_gpu_device.startswith('cuda') or
-    config.init_gpu_device.startswith('opencl')):
+    config.init_gpu_device.startswith('opencl') or
+    config.contexts != ''):
     import theano.sandbox.gpuarray

 # Use config.numpy to call numpy.seterr
theano/compile/debugmode.py
@@ -580,7 +580,7 @@ def debugprint(r, prefix='', depth=-1, done=None, print_type=False,
                print_view_map=False, order=None, ids='CHAR',
                stop_on_name=False, prefix_child=None, scan_ops=None,
-               profile=None, scan_inner_to_outer_inputs=None):
+               profile=None, scan_inner_to_outer_inputs=None, smap=None):
     """
     Print the graph leading to `r` to given depth.

@@ -620,7 +620,8 @@
     scan_inner_to_outer_inputs
         A dictionary mapping a scan ops inner function inputs to the scan op
         inputs (outer inputs) for printing purposes.
+    smap
+        None or the storage_map when printing a Theano function.

     """
     if depth == 0:
         return

@@ -689,23 +690,21 @@
         already_printed = a in done  # get_id_str put it in the dict
         id_str = get_id_str(a)

+        if len(a.outputs) == 1:
+            idx = ""
+        else:
+            idx = ".%i" % a.outputs.index(r)
+        data = ""
+        if smap:
+            data = " " + str(smap.get(a.outputs[0], ''))
         if profile is None or a not in profile.apply_time:
-            if len(a.outputs) == 1:
-                print('%s%s %s%s \'%s\' %s%s%s' %
-                      (prefix, a.op, id_str, type_str, r_name,
-                       destroy_map_str, view_map_str, o), file=file)
-            else:
-                print('%s%s.%i %s%s \'%s\' %s%s%s' %
-                      (prefix, a.op, a.outputs.index(r), id_str, type_str,
-                       r_name, destroy_map_str, view_map_str, o), file=file)
+            print('%s%s%s %s%s \'%s\' %s%s%s%s' %
+                  (prefix, a.op, idx, id_str, type_str, r_name,
+                   destroy_map_str, view_map_str, o, data), file=file)
         else:
             op_time = profile.apply_time[a]
             op_time_percent = (op_time / profile.fct_call_time) * 100

@@ -714,28 +713,19 @@
             tot_time_percent = (tot_time_dict[a] / profile.fct_call_time) * 100
-            if len(a.outputs) == 1:
-                print("%s%s %s%s '%s' %s%s%s --> "
-                      "%8.2es %4.1f%% %8.2es %4.1f%% "
-                      % (prefix, a.op, id_str, type_str, r_name,
-                         destroy_map_str, view_map_str, o,
-                         op_time, op_time_percent, tot_time,
-                         tot_time_percent), file=file)
-            else:
-                print("%s%s.%i %s%s '%s' %s%s%s --> "
-                      "%8.2es %4.1f%% %8.2es %4.1f%% "
-                      % (prefix, a.op, a.outputs.index(r), id_str, type_str,
-                         r_name, destroy_map_str, view_map_str, o,
-                         op_time, op_time_percent, tot_time,
-                         tot_time_percent), file=file)
+            print("%s%s%s %s%s '%s' %s%s%s%s --> "
+                  "%8.2es %4.1f%% %8.2es %4.1f%% "
+                  % (prefix, a.op, idx, id_str, type_str, r_name,
+                     destroy_map_str, view_map_str, o, data,
+                     op_time, op_time_percent, tot_time,
+                     tot_time_percent), file=file)

@@ -761,7 +751,8 @@
                        ids=ids, stop_on_name=stop_on_name,
                        prefix_child=new_prefix_child, scan_ops=scan_ops,
-                       profile=profile,
-                       scan_inner_to_outer_inputs=scan_inner_to_outer_inputs)
+                       profile=profile,
+                       scan_inner_to_outer_inputs=scan_inner_to_outer_inputs,
+                       smap=smap)
     else:
         if scan_inner_to_outer_inputs is not None and\
            r in scan_inner_to_outer_inputs:

@@ -777,8 +768,13 @@
                    outer_id_str), file=file)
         else:
             # this is an input variable
+            data = ""
+            if smap:
+                data = " " + str(smap.get(r, ''))
             id_str = get_id_str(r)
-            print('%s%s %s%s' % (prefix, r, id_str, type_str), file=file)
+            print('%s%s %s%s%s' % (prefix, r, id_str, type_str, data),
+                  file=file)

     return file
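The refactor above removes the duplicated single-output/multi-output print branches by computing the output-index suffix once. A minimal sketch of that suffix logic in isolation (hypothetical helper name, not part of the patch):

```python
def output_index_suffix(outputs, r):
    """'' for a single-output apply node, '.<i>' for multi-output ones.

    Mirrors the `idx` computation in debugmode.debugprint after the merge.
    """
    if len(outputs) == 1:
        return ""
    return ".%i" % outputs.index(r)

# Single output: no suffix; multi-output: dotted index of this result.
print(output_index_suffix(['only'], 'only'))   # empty string
print(output_index_suffix(['a', 'b', 'c'], 'b'))
```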
theano/compile/mode.py
@@ -91,6 +91,8 @@ exclude = []
 if not theano.config.cxx:
     exclude = ['cxx_only']
 OPT_NONE = gof.Query(include=[], exclude=exclude)
+# Even if multiple merge optimizer call will be there, this shouldn't
+# impact performance.
+OPT_MERGE = gof.Query(include=['merge'], exclude=exclude)
 OPT_FAST_RUN = gof.Query(include=['fast_run'], exclude=exclude)
 OPT_FAST_RUN_STABLE = OPT_FAST_RUN.requiring('stable')

@@ -113,7 +115,7 @@ OPT_STABILIZE.name = 'OPT_STABILIZE'
 predefined_optimizers = {
     None: OPT_NONE,
     'None': OPT_NONE,
-    'merge': gof.MergeOptimizer(),
+    'merge': OPT_MERGE,
     'fast_run': OPT_FAST_RUN,
     'fast_run_stable': OPT_FAST_RUN_STABLE,
     'fast_compile': OPT_FAST_COMPILE,
theano/compile/tests/test_mode.py
@@ -25,3 +25,8 @@ def test_no_output_from_implace():
     fct_opt = theano.function([x, y], b, mode=mode_opt)
     op = fct_opt.maker.fgraph.outputs[0].owner.op
     assert (not hasattr(op, 'destroy_map') or
             0 not in op.destroy_map)
+
+
+def test_including():
+    mode = theano.Mode(optimizer='merge')
+    mode.including('fast_compile')
theano/configdefaults.py
@@ -111,6 +111,29 @@ AddConfigVar(
     BoolParam(False, allow_override=False),
     in_c_key=False)

+
+class ContextsParam(ConfigParam):
+    def __init__(self):
+        def filter(val):
+            if val == '':
+                return val
+            for v in val.split(';'):
+                s = v.split('->')
+                if len(s) != 2:
+                    raise ValueError("Malformed context map: %s" % (v,))
+            return val
+        ConfigParam.__init__(self, '', filter, False)
+
+AddConfigVar(
+    'contexts',
+    """
+    Context map for multi-gpu operation. Format is a
+    semicolon-separated list of names and device names in the
+    'name->dev_name' format. An example that would map name 'test' to
+    device 'cuda0' and name 'test2' to device 'opencl0:0' follows:
+    "test->cuda0;test2->opencl0:0".
+    """, ContextsParam(), in_c_key=False)
+
 AddConfigVar('print_active_device',
              "Print active device at when the GPU device is initialized.",
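The `ContextsParam` filter above accepts the empty string and otherwise requires every `;`-separated entry to split into exactly two parts around `->`. The same validation as a standalone sketch (hypothetical function name, not part of the patch):

```python
def validate_contexts(val):
    """Validate a context map like "test->cuda0;test2->opencl0:0".

    Mirrors the filter in ContextsParam: '' is allowed, otherwise
    every ';'-separated entry must have the form 'name->dev_name'.
    """
    if val == '':
        return val
    for v in val.split(';'):
        s = v.split('->')
        if len(s) != 2:
            raise ValueError("Malformed context map: %s" % (v,))
    return val

print(validate_contexts("test->cuda0;test2->opencl0:0"))
```

Note that an entry with zero or two `->` separators yields one or three parts and is rejected.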
theano/d3viz/tests/test_formatting.py
@@ -32,7 +32,7 @@ class TestPyDotFormatter(unittest.TestCase):
         expected = 11
         if th.config.mode == "FAST_COMPILE":
             expected = 12
-        self.assertEqual(len(graph.get_nodes()), 12)
+        self.assertEqual(len(graph.get_nodes()), expected)
         nc = self.node_counts(graph)
         if th.config.mode == "FAST_COMPILE":
theano/gof/cc.py
@@ -547,9 +547,7 @@ class CLinker(link.Linker):
         if no_recycling is None:
             no_recycling = []
         if self.fgraph is not None and self.fgraph is not fgraph:
-            return type(self)().accept(fgraph, no_recycling)
-            # raise Exception("Cannot accept from a Linker that is already"
-            #                 " tied to another FunctionGraph.")
+            return type(self)(self.schedule).accept(fgraph, no_recycling)
         self.fgraph = fgraph
         self.fetch_variables()
         self.no_recycling = no_recycling

@@ -1284,6 +1282,34 @@ class CLinker(link.Linker):
             c_compiler=self.c_compiler(),
         )

+    def cmodule_key_variables(self, inputs, outputs, no_recycling,
+                              compile_args=None, libraries=None,
+                              header_dirs=None, insert_config_md5=True,
+                              c_compiler=None):
+
+        # Assemble a dummy fgraph using the provided inputs and outputs. It is
+        # only used to compute the cmodule key so it only need to expose an
+        # `inputs` and an `outputs` attribute as well as a toposort() method
+        # which returns a deterministic result.
+        class FakeFunctionGraph():
+            def __init__(self, inputs, outputs):
+                self.inputs = inputs
+                self.outputs = outputs
+
+            def toposort(self):
+                # Calling io_toposort() here is fine because the results will
+                # only be used to compute the cmodule key which requires that
+                # the result of the toposort be deterministic. The ordering
+                # doesn't need to include information about inplace operations
+                # because that information will be included explicitly in
+                # cmodule_key_().
+                return graph.io_toposort(self.inputs, self.outputs)
+
+        fgraph = FakeFunctionGraph(inputs, outputs)
+        return self.cmodule_key_(fgraph, no_recycling, compile_args,
+                                 libraries, header_dirs, insert_config_md5,
+                                 c_compiler)
+
     def cmodule_key_(self, fgraph, no_recycling, compile_args=None,
                      libraries=None, header_dirs=None,
                      insert_config_md5=True, c_compiler=None):

@@ -1425,8 +1451,15 @@ class CLinker(link.Linker):
             fgraph_computed_set.update(node.outputs)

         # Add not used input in the key
+        # If inputs don't define a 'clients' attribute (as is the case if
+        # fgraph is not a real FunctionGraph but a FakeFunctionGraph, a
+        # lightweight class designed to imitate FunctionGraph), pretend they
+        # have none. This is fine because the goal is only to have all of the
+        # graph's information used to compute the key. If we mistakenly
+        # pretend that inputs with clients don't have any, we are only using
+        # those inputs more than once to compute the key.
         for ipos, var in [(i, var) for i, var in enumerate(fgraph.inputs)
-                          if not len(var.clients)]:
+                          if not len(getattr(var, 'clients', []))]:
             sig.append((var.type, in_sig(var, -1, ipos)))

         # crystalize the signature and version

@@ -1720,7 +1753,8 @@ class OpWiseCLinker(link.LocalLinker):
             return type(self)(fallback_on_perform=self.fallback_on_perform,
                               allow_gc=self.allow_gc,
-                              nice_errors=self.nice_errors
+                              nice_errors=self.nice_errors,
+                              schedule=self.schedule,
                               ).accept(fgraph, no_recycling)
             # raise Exception("Cannot accept from a Linker that is
             # already tied to another FunctionGraph.")

@@ -1873,7 +1907,8 @@ class DualLinker(link.Linker):
         if no_recycling is None:
             no_recycling = []
         if self.fgraph is not None and self.fgraph is not fgraph:
-            return type(self)(self.checker).accept(fgraph, no_recycling)
+            return type(self)(self.checker, self.schedule).accept(
+                fgraph, no_recycling)
         self.fgraph = fgraph
         self.no_recycling = no_recycling
         return self
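The new `cmodule_key_variables()` relies on duck typing: any object exposing `inputs`, `outputs`, and a deterministic `toposort()` can stand in for a real `FunctionGraph` when computing the key. A minimal sketch of the pattern, with a hypothetical `graph_key` consumer standing in for `cmodule_key_()`:

```python
class FakeGraph(object):
    """Duck-typed stand-in exposing only the attributes a consumer needs."""
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def toposort(self):
        # Deterministic: the same inputs/outputs always yield the same order,
        # which is the only property the key computation requires.
        return sorted(self.inputs) + sorted(self.outputs)


def graph_key(fgraph):
    # Hypothetical consumer: it only touches inputs/outputs/toposort(),
    # so it cannot tell a FakeGraph from a full FunctionGraph.
    return tuple(graph_key_part for graph_key_part in fgraph.toposort())

print(graph_key(FakeGraph(['b', 'a'], ['c'])))
```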
theano/gof/cmodule.py
@@ -17,6 +17,7 @@ import tempfile
 import time
 import platform
 import distutils.sysconfig
+import warnings

 import numpy.distutils

 # TODO: TensorType should handle this

@@ -324,6 +325,9 @@ def dlimport(fullpath, suffix=None):
     if hasattr(importlib, "invalidate_caches"):
         importlib.invalidate_caches()
     t0 = time.time()
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore",
+                                message="numpy.ndarray size changed")
         rval = __import__(module_name, {}, {}, [module_name])
     t1 = time.time()
     import_time += t1 - t0
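The change above silences only the known-benign "numpy.ndarray size changed" warning, and only for the duration of that import, by scoping the filter inside `warnings.catch_warnings()`. The general standard-library pattern, demonstrated with a stand-in function that emits the warning:

```python
import warnings

def noisy():
    # Stand-in for the module import that triggers the benign warning.
    warnings.warn("numpy.ndarray size changed")
    return 42

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Ignore only warnings whose message matches this regex; the filter
    # is restored automatically when the `with` block exits.
    warnings.filterwarnings("ignore", message="numpy.ndarray size changed")
    result = noisy()

print(result, len(caught))
```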
theano/misc/check_multi_gpu.py (new file, mode 100644)

#!/usr/bin/env python
"""
This file compares the runtime of two independent dot products on one
and two GPUs to measure the speedup.

This should be 2x if the GPUs are equivalent.
"""
import time

import numpy

import theano
from theano.sandbox.gpuarray import init_dev
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor as shared
from theano.sandbox.gpuarray.blas import gpu_dot22


def main(dev1, dev2):
    init_dev(dev1, 'ctx1')
    init_dev(dev2, 'ctx2')

    val1a = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1b = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1c = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1d = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')

    val2a = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx2')
    val2b = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx2')

    f1 = theano.function([], [gpu_dot22(val1a, val1b),
                              gpu_dot22(val1c, val1d)])
    f2 = theano.function([], [gpu_dot22(val1a, val1b),
                              gpu_dot22(val2a, val2b)])

    # Warm up, then time two dot products on a single context.
    r = f1()
    r[0].sync(), r[1].sync()
    r = None
    t = time.time()
    r = f1()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print("one ctx %f" % (t2 - t,))

    # Warm up, then time the same work split across two contexts.
    r = f2()
    r[0].sync(), r[1].sync()
    r = None
    t = time.time()
    r = f2()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print("two ctx %f" % (t2 - t,))


if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        raise ValueError("This script requires two device names.")
    main(sys.argv[1], sys.argv[2])
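The script above times each function twice: one untimed warm-up call (so kernel compilation and allocation are excluded) and `.sync()` calls so the asynchronous GPU work has actually finished before the clock stops. The warm-up-then-time pattern, sketched on a plain Python callable (hypothetical `benchmark` helper, not part of the patch):

```python
import time

def benchmark(fn, repeat=3):
    """Run fn once untimed (warm-up), then return the best of `repeat` timings."""
    fn()  # warm-up: exclude one-time setup costs from the measurement
    best = float('inf')
    for _ in range(repeat):
        t0 = time.time()
        fn()
        best = min(best, time.time() - t0)
    return best

elapsed = benchmark(lambda: sum(range(10000)))
print("best of 3: %f s" % elapsed)
```

For GPU code the analogue of calling `fn()` to completion is synchronizing on the results, as the script does with `.sync()`; without it the timer would measure only kernel launch, not execution.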
theano/printing.py
@@ -48,7 +48,7 @@ VALID_ASSOC = set(['left', 'right', 'either'])
 def debugprint(obj, depth=-1, print_type=False,
                file=None, ids='CHAR', stop_on_name=False,
-               done=None):
+               done=None, print_storage=False):
     """Print a computation graph as text to stdout or a file.

     :type obj: Variable, Apply, or Function instance

@@ -70,6 +70,10 @@ def debugprint(obj, depth=-1, print_type=False,
     :type done: None or dict
     :param done: A dict where we store the ids of printed node.
         Useful to have multiple call to debugprint share the same ids.
+    :type print_storage: bool
+    :param print_storage: If True, this will print the storage map
+        for Theano functions. Combined with allow_gc=False, after the
+        execution of a Theano function, we see the intermediate result.

     :returns: string if `file` == 'str', else file arg

@@ -101,7 +105,8 @@ def debugprint(obj, depth=-1, print_type=False,
         done = dict()
     results_to_print = []
     profile_list = []
-    order = []
+    order = []  # Toposort
+    smap = []  # storage_map
     if isinstance(obj, (list, tuple, set)):
         lobj = obj
     else:

@@ -110,24 +115,41 @@ def debugprint(obj, depth=-1, print_type=False,
     if isinstance(obj, gof.Variable):
         results_to_print.append(obj)
         profile_list.append(None)
+        smap.append(None)
+        order.append(None)
     elif isinstance(obj, gof.Apply):
         results_to_print.extend(obj.outputs)
         profile_list.extend([None for item in obj.outputs])
+        smap.extend([None for item in obj.outputs])
+        order.extend([None for item in obj.outputs])
     elif isinstance(obj, Function):
         results_to_print.extend(obj.maker.fgraph.outputs)
         profile_list.extend(
             [obj.profile for item in obj.maker.fgraph.outputs])
-        order = obj.maker.fgraph.toposort()
+        if print_storage:
+            smap.extend(
+                [obj.fn.storage_map for item in obj.maker.fgraph.outputs])
+        else:
+            smap.extend(
+                [None for item in obj.maker.fgraph.outputs])
+        topo = obj.maker.fgraph.toposort()
+        order.extend([topo for item in obj.maker.fgraph.outputs])
     elif isinstance(obj, gof.FunctionGraph):
         results_to_print.extend(obj.outputs)
         profile_list.extend([getattr(obj, 'profile', None)
                              for item in obj.outputs])
-        order = obj.toposort()
+        smap.extend([getattr(obj, 'storage_map', None)
+                     for item in obj.outputs])
+        topo = obj.toposort()
+        order.extend([topo for item in obj.outputs])
     elif isinstance(obj, (integer_types, float, np.ndarray)):
         print(obj)
     elif isinstance(obj, (theano.In, theano.Out)):
         results_to_print.append(obj.variable)
         profile_list.append(None)
+        smap.append(None)
+        order.append(None)
     else:
         raise TypeError("debugprint cannot print an object of this type",
                         obj)

@@ -152,16 +174,16 @@ N.B.:
 to remove when optimizing a graph because their <total time> is very low.
 """, file=_file)

-    for r, p in zip(results_to_print, profile_list):
+    for r, p, s, o in zip(results_to_print, profile_list, smap, order):
         # Add the parent scan op to the list as well
         if (hasattr(r.owner, 'op') and
                 isinstance(r.owner.op, theano.scan_module.scan_op.Scan)):
             scan_ops.append(r)

         debugmode.debugprint(r, depth=depth, done=done,
                              print_type=print_type,
-                             file=_file, order=order, ids=ids,
+                             file=_file, order=o, ids=ids,
                              scan_ops=scan_ops, stop_on_name=stop_on_name,
-                             profile=p)
+                             profile=p, smap=s)
     if len(scan_ops) > 0:
         print("", file=_file)

@@ -996,7 +1018,11 @@ def pydotprint(fct, outfile=None,
             else:
                 new_name = basename + '_' + str(idx)
             new_name = os.path.join(path, new_name + ext)
-            pydotprint(scan_op.op.fn, new_name, compact, format, with_ids,
+            if hasattr(scan_op.op, 'fn'):
+                to_print = scan_op.op.fn
+            else:
+                to_print = scan_op.op.outputs
+            pydotprint(to_print, new_name, compact, format, with_ids,
                        high_contrast, cond_highlight, colorCodes,
                        max_label_size, scan_graphs)
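With `print_storage=True`, each printed line gets an extra suffix looked up in the storage map via `smap.get(...)`. That suffix logic in isolation, using a plain dict as a stand-in storage map (hypothetical helper name, not part of the patch):

```python
def line_suffix(smap, var):
    """Return ' <storage>' when a storage map is given, else ''.

    Mirrors the `data` computation in debugmode.debugprint: with a map
    present, a missing key still yields a single-space suffix because
    smap.get(var, '') falls back to the empty string.
    """
    if smap:
        return " " + str(smap.get(var, ''))
    return ""

storage_map = {'x': [3.0]}  # toy map: variable name -> storage cell
print(repr(line_suffix(storage_map, 'x')))
print(repr(line_suffix(None, 'x')))
```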
theano/sandbox/cuda/basic_ops.py
@@ -92,10 +92,7 @@ class HostFromGpu(GpuOp):
     def R_op(self, inputs, eval_points):
         ev, = eval_points
-        if isinstance(ev, tensor.TensorType):
-            return [gpu_from_host(ev)]
-        else:
-            return [ev]
+        return self(ev)

     def infer_shape(self, node, xshp):
         return xshp

@@ -155,10 +152,7 @@ class GpuFromHost(GpuOp):
     def R_op(self, inputs, eval_points):
         ev, = eval_points
-        if isinstance(ev, CudaNdarrayType):
-            return [host_from_gpu(ev)]
-        else:
-            return [ev]
+        return self(ev)

     def infer_shape(self, node, xshp):
         return xshp
theano/sandbox/cuda/cuda_ndarray.cu
@@ -142,8 +142,8 @@ void * device_malloc(size_t size, int verbose)
         status = cnmemMalloc(&rval, size, NULL);
         if (status != CNMEM_STATUS_SUCCESS)
         {
             PyErr_Format(PyExc_MemoryError,
-                         "Error allocating %zd bytes of device memory (%s).",
-                         size, cnmemGetErrorString(status));
+                         "Error allocating %llu bytes of device memory (%s).",
+                         (unsigned long long)size, cnmemGetErrorString(status));
             return NULL;
         }
     }

@@ -168,21 +168,21 @@ void * device_malloc(size_t size, int verbose)
         }
         #if COMPUTE_GPU_MEM_USED
         fprintf(stderr,
-                "Error allocating %zd bytes of device memory (%s)."
-                " new total bytes allocated: %d."
-                " Driver report %zd bytes free and %zd bytes total \n",
-                size, cudaGetErrorString(err), _allocated_size, free, total);
+                "Error allocating %llu bytes of device memory (%s)."
+                " new total bytes allocated: %llu."
+                " Driver report %llu bytes free and %llu bytes total \n",
+                (unsigned long long)size, cudaGetErrorString(err),
+                (unsigned long long)_allocated_size,
+                (unsigned long long)free, (unsigned long long)total);
         #else
         fprintf(stderr,
-                "Error allocating %zd bytes of device memory (%s)."
-                " Driver report %zd bytes free and %zd bytes total \n",
-                size, cudaGetErrorString(err), free, total);
+                "Error allocating %llu bytes of device memory (%s)."
+                " Driver report %llu bytes free and %llu bytes total \n",
+                (unsigned long long)size, cudaGetErrorString(err),
+                (unsigned long long)free, (unsigned long long)total);
         #endif
         }
         PyErr_Format(PyExc_MemoryError,
-                     "Error allocating %zd bytes of device memory (%s).",
-                     size, cudaGetErrorString(err));
+                     "Error allocating %llu bytes of device memory (%s).",
+                     (unsigned long long)size, cudaGetErrorString(err));
         return NULL;
     }
 }

@@ -310,17 +310,17 @@ int device_free(void *ptr)
         }
         assert(i < TABLE_SIZE);
         fprintf(stderr,
-                "Error freeing device pointer %p (%s) of size %d. %zd byte already allocated."
-                " Driver report %zd bytes free and %zd bytes total \n",
+                "Error freeing device pointer %p (%s) of size %llu. %llu byte already allocated."
+                " Driver report %llu bytes free and %llu bytes total \n",
                 ptr, cudaGetErrorString(err),
-                _alloc_size_table[i].size, _allocated_size, free, total);
+                (unsigned long long)_alloc_size_table[i].size,
+                (unsigned long long)_allocated_size,
+                (unsigned long long)free, (unsigned long long)total);
         }
         #else
         fprintf(stderr,
                 "Error freeing device pointer %p (%s)."
-                " Driver report %zd bytes free and %zd bytes total \n",
-                ptr, cudaGetErrorString(err), free, total);
+                " Driver report %llu bytes free and %llu bytes total \n",
+                ptr, cudaGetErrorString(err),
+                (unsigned long long)free, (unsigned long long)total);
         #endif
         if (NULL != PyErr_Occurred()){
             fprintf(stderr,
theano/sandbox/cuda/dnn.py
@@ -1765,10 +1765,6 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
         bottom borders.
         pad_w is the number of zero-valued pixels added to each of the left
         and right borders.
-    nd
-        Number of dimensions of pooling, can be 2 or 3 for 2d or 3d pooling
-        If set to 3 all other params (except mode) must have an extra
-        dimension to match. 3 is only available for cudnn v3

     .. warning:: The cuDNN library only works with GPU that have a compute
         capability of 3.0 or higer. This means that older GPU will not
theano/sandbox/cuda/opt.py
@@ -2478,8 +2478,11 @@ def local_gpu_allocempty(node):
     return False

+def typeInfer(node):
+    return typeConstructor
+
 optdb.register('gpu_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
+               scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer,
                                              gpu_flag=True),
                75,
                'gpu',
theano/sandbox/cuda/tests/test_dnn.py
@@ -279,8 +279,7 @@ def test_pooling():
             a = f1(data).__array__()

             b = f2(data).__array__()
-            assert numpy.allclose(a, b,
-                                  atol=numpy.finfo(numpy.float32).eps)
+            utt.assert_allclose(a, b)

     # Test the grad
     for shp in [(1, 1, 2, 2),

@@ -338,7 +337,7 @@ def test_pooling():
         assert any([isinstance(node.op, AveragePoolGrad)
                     for node in fc.maker.fgraph.toposort()])
         c_out = fc(data)
-        assert numpy.allclose(c_out, g_out)
+        utt.assert_allclose(c_out, g_out)


 def test_pooling3d():

@@ -443,7 +442,7 @@ def test_pooling3d():
         fc = theano.function([x], theano.grad(out.sum(), x),
                              mode=mode_without_gpu)
         c_out = fc(data)
-        assert numpy.allclose(c_out, g_out)
+        utt.assert_allclose(c_out, g_out)


 def test_pooling_opt():

@@ -1357,8 +1356,10 @@ def test_conv3d_bwd():
     # Compare the results of the two implementations
     res_ref = f_ref()
     res = f()
-    utt.assert_allclose(res_ref[0], res[0])
-    utt.assert_allclose(res_ref[1], res[1])
+    # Needed for big size for some seed
+    # raise rtol to make the test pass with more seed.
+    utt.assert_allclose(res_ref[0], res[0], rtol=2e-5)
+    utt.assert_allclose(res_ref[1], res[1], rtol=2e-5)

 test_cases = get_conv3d_test_cases()
 for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
theano/sandbox/gpuarray/__init__.py
浏览文件 @
0c12e4e5
...
...
@@ -21,26 +21,30 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
from
.type
import
(
GpuArrayType
,
GpuArrayVariable
,
GpuArrayConstant
,
GpuArraySharedVariable
,
gpuarray_shared_constructor
)
GpuArraySharedVariable
,
gpuarray_shared_constructor
,
reg_context
)
from
.
import
opt
,
nerv
def
init_dev
(
dev
):
def
init_dev
(
dev
,
name
=
None
):
if
pygpu
.
gpuarray
.
api_version
()
!=
(
-
10000
,
0
):
raise
RuntimeError
(
"Wrong API version for gpuarray:"
,
pygpu
.
gpuarray
.
api_version
(),
"Make sure Theano and libgpuarray/pygpu "
"are in sync."
)
global
pygpu_activated
context
=
pygpu
.
init
(
dev
)
pygpu
.
set_default_context
(
context
)
if
dev
not
in
init_dev
.
devmap
:
init_dev
.
devmap
[
dev
]
=
pygpu
.
init
(
dev
)
context
=
init_dev
.
devmap
[
dev
]
# This will map the context name to the real context object.
reg_context
(
name
,
context
)
pygpu_activated
=
True
if
config
.
print_active_device
:
print
(
"Using device
%
s:
%
s"
%
(
dev
,
context
.
devname
),
file
=
sys
.
stderr
)
# remember the active device
init_dev
.
device
=
dev
print
(
"Mapped name
%
s to device
%
s:
%
s"
%
(
name
,
dev
,
context
.
devname
),
file
=
sys
.
stderr
)
init_dev
.
device
=
None
# This maps things like 'cuda0' to the context object on that device.
init_dev
.
devmap
=
{}
if
pygpu
:
try
:
...
...
@@ -52,11 +56,21 @@ if pygpu:
             optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
         elif (config.init_gpu_device.startswith('cuda') or
               config.init_gpu_device.startswith('opencl')):
             if config.device != 'cpu':
                 raise ValueError('you must set device=cpu to use init_gpu_device.')
+            if config.contexts != '':
+                print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
             init_dev(config.init_gpu_device)
+        if config.contexts != '':
+            for n, d in (c.split('->') for c in config.contexts.split(';')):
+                init_dev(d.strip(), n.strip())
+            import theano.compile
+            theano.compile.shared_constructor(gpuarray_shared_constructor)
+            optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')

         from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
                                 GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
-        from .basic_ops import host_from_gpu, gpu_from_host
+        from .basic_ops import host_from_gpu, GpuFromHost
         from .elemwise import GpuElemwise
         from .subtensor import (GpuSubtensor, GpuIncSubtensor,
                                 GpuAdvancedIncSubtensor1)
...
...
@@ -67,5 +81,6 @@ else:
     if (config.init_gpu_device.startswith('cuda') or
         config.init_gpu_device.startswith('opencl') or
         config.device.startswith('opencl') or
-        config.device.startswith('cuda')):
+        config.device.startswith('cuda') or
+        config.contexts != ''):
         error("pygpu was configured but could not be imported", exc_info=True)
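The added `contexts` branch parses a mapping flag of the form `name->device;name->device`. A small, self-contained sketch of that parsing step (`parse_contexts` is a hypothetical helper; the generator expression is the same one the diff uses):

```python
def parse_contexts(contexts):
    # 'name->dev;name->dev' -> list of (name, device) pairs,
    # with surrounding whitespace stripped from each part.
    return [(n.strip(), d.strip())
            for n, d in (c.split('->') for c in contexts.split(';'))]
```

Each resulting pair is then fed to `init_dev(device, name)` so a context name maps onto a concrete device.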
theano/sandbox/gpuarray/basic_ops.py  (view file @ 0c12e4e5 — diff collapsed)
theano/sandbox/gpuarray/blas.py
 import os.path

-from theano import Apply, config
+from theano import Apply, config, Op
 from theano.compile import optdb
-from theano.gof import local_optimizer, LocalOptGroup
+from theano.gof import LocalOptGroup
 from theano.tensor.basic import as_tensor_variable
 from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
 from theano.tensor.opt import in2out

-from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
+from .basic_ops import as_gpuarray_variable, infer_context_name
+from .opt_util import inplace_allocempty

 try:
     import pygpu
...
...
@@ -18,7 +19,7 @@ except ImportError as e:
     pass


-class BlasOp(HideC):
+class BlasOp(Op):
     def c_headers(self):
         return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>']
...
...
@@ -28,34 +29,27 @@ class BlasOp(HideC):
     def c_init_code(self):
         return ['import_pygpu__blas();']

-    def c_support_code(self):
-        return """
-        PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
-                                           PyGpuArrayObject *y) {
-            if (out &&
-                GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
-                theano_size_check(out, PyGpuArray_NDIM(y),
-                                  PyGpuArray_DIMS(y),
-                                  y->ga.typecode)) {
-                if (pygpu_move(out, y)) {
-                    Py_XDECREF(out);
-                    return NULL;
-                }
-            } else {
-                Py_XDECREF(out);
-                out = pygpu_copy(y, GA_ANY_ORDER);
-            }
-            return out;
-        }
-        """

-class GpuGemv(BlasOp, Gemv):
+class GpuGemv(BlasOp):
+    __props__ = ('inplace',)
+
+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+
     def make_node(self, y, alpha, A, x, beta):
-        Gemv.make_node(self, y, alpha, A, x, beta)
-        A = as_gpuarray_variable(A)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(y, A, x)
+        A = as_gpuarray_variable(A, ctx_name)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
         alpha = as_tensor_variable(alpha)
         beta = as_tensor_variable(beta)
+        assert alpha.ndim == 0
+        assert beta.ndim == 0
+        assert A.ndim == 2
+        assert x.ndim == 1
+        assert y.ndim == 1
+        assert A.dtype == x.dtype == y.dtype
         return Apply(self, [y, alpha, A, x, beta], [y.type()])
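GpuGemv now carries its own `inplace` flag and, when it is set, publishes `destroy_map = {0: [0]}` so the graph machinery knows that output 0 overwrites input 0. A stripped-down sketch of that convention (plain Python, no Theano):

```python
class InplaceableOp:
    """Sketch of the inplace/destroy_map convention used by GpuGemv above."""
    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Declared only on the destructive variant: output 0
            # destroys (reuses the storage of) input 0.
            self.destroy_map = {0: [0]}

op = InplaceableOp(inplace=True)
```

Leaving `destroy_map` unset on the non-destructive variant matters: the scheduler treats its absence as "no input is overwritten".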
...
...
@@ -73,7 +67,7 @@ class GpuGemv(BlasOp, Gemv):
         if self.inplace:
             code = """
                    if (%(y)s->ga.strides[0] <= 0) {
-                     %(out)s = gpublas_try_copy(%(out)s, %(y)s);
+                     %(out)s = theano_try_copy(%(out)s, %(y)s);
                      if (%(out)s == NULL) {
                        %(fail)s
                      }
...
@@ -85,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
                    """ % vars
         else:
             code = """
-                   %(out)s = gpublas_try_copy(%(out)s, %(y)s);
+                   %(out)s = theano_try_copy(%(out)s, %(y)s);
                    if (%(out)s == NULL) {
                      %(fail)s
                    }
...
@@ -106,21 +100,33 @@ class GpuGemv(BlasOp, Gemv):
         return code

     def c_code_cache_version(self):
-        return (3,)
+        return (4,)

 gpugemv_no_inplace = GpuGemv(inplace=False)
 gpugemv_inplace = GpuGemv(inplace=True)
-class GpuGemm(BlasOp, Gemm):
+class GpuGemm(BlasOp):
+    __props__ = ('inplace',)
     _f16_ok = True

+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+
     def make_node(self, C, alpha, A, B, beta):
+        ctx_name = infer_context_name(C, A, B)
+        A = as_gpuarray_variable(A, ctx_name)
+        B = as_gpuarray_variable(B, ctx_name)
+        C = as_gpuarray_variable(C, ctx_name)
         alpha = as_tensor_variable(alpha)
         beta = as_tensor_variable(beta)
-        A = as_gpuarray_variable(A)
-        B = as_gpuarray_variable(B)
-        C = as_gpuarray_variable(C)
         assert alpha.ndim == 0
         assert beta.ndim == 0
         assert A.ndim == 2
         assert B.ndim == 2
         assert C.ndim == 2
         assert A.dtype == B.dtype == C.dtype
         return Apply(self, [C, alpha, A, B, beta], [C.type()])
...
...
@@ -138,7 +144,7 @@ class GpuGemm(BlasOp, Gemm):
         if self.inplace:
             code = """
                    if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
-                     %(out)s = gpublas_try_copy(%(out)s, %(C)s);
+                     %(out)s = theano_try_copy(%(out)s, %(C)s);
                      if (%(out)s == NULL) {
                        %(fail)s
                      }
...
@@ -150,7 +156,7 @@ class GpuGemm(BlasOp, Gemm):
                    """ % vars
         else:
             code = """
-                   %(out)s = gpublas_try_copy(%(out)s, %(C)s);
+                   %(out)s = theano_try_copy(%(out)s, %(C)s);
                    if (%(out)s == NULL) {
                      %(fail)s
                    }
...
@@ -171,25 +177,36 @@ class GpuGemm(BlasOp, Gemm):
         return code

     def c_code_cache_version(self):
-        return (4,)
+        return (5,)

 gpugemm_no_inplace = GpuGemm(inplace=False)
 gpugemm_inplace = GpuGemm(inplace=True)
-class GpuGer(BlasOp, Ger):
+class GpuGer(BlasOp):
+    __props__ = ('inplace',)
+
+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+
     def make_node(self, A, alpha, x, y):
-        Ger.make_node(self, A, alpha, x, y)
-        A = as_gpuarray_variable(A)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(A, x, y)
+        A = as_gpuarray_variable(A, ctx_name)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
         alpha = as_tensor_variable(alpha)
+        assert alpha.ndim == 0
+        assert A.ndim == 2
+        assert x.ndim == 1
+        assert y.ndim == 1
+        assert A.dtype == x.dtype == y.dtype
         return Apply(self, [A, alpha, x, y], [A.type()])

     def perform(self, node, inp, out):
         A, alpha, x, y = inp
-        inplace = self.destructive
+        inplace = self.inplace
        if inplace and not A.flags.forc:
            inplace = False
        out[0][0] = blas.ger(alpha, x, y, A,
...
@@ -198,10 +215,10 @@ class GpuGer(BlasOp, Ger):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
vars
=
dict
(
out
=
out
[
0
],
A
=
inp
[
0
],
alpha
=
inp
[
1
],
x
=
inp
[
2
],
y
=
inp
[
3
],
fail
=
sub
[
'fail'
],
name
=
name
)
if
self
.
destructiv
e
:
if
self
.
inplac
e
:
code
=
"""
if (!GpuArray_ISONESEGMENT(&
%(A)
s->ga)) {
%(out)
s =
gpublas
_try_copy(
%(out)
s,
%(A)
s);
%(out)
s =
theano
_try_copy(
%(out)
s,
%(A)
s);
if (
%(out)
s == NULL) {
%(fail)
s
}
...
...
@@ -213,7 +230,7 @@ class GpuGer(BlasOp, Ger):
"""
%
vars
else
:
code
=
"""
%(out)
s =
gpublas
_try_copy(
%(out)
s,
%(A)
s);
%(out)
s =
theano
_try_copy(
%(out)
s,
%(A)
s);
if (
%(out)
s == NULL) {
%(fail)
s
}
...
...
@@ -231,18 +248,22 @@ class GpuGer(BlasOp, Ger):
return
code
def
c_code_cache_version
(
self
):
return
(
2
,)
return
(
3
,)
gpuger_no_inplace
=
GpuGer
(
destructiv
e
=
False
)
gpuger_inplace
=
GpuGer
(
destructiv
e
=
True
)
gpuger_no_inplace
=
GpuGer
(
inplac
e
=
False
)
gpuger_inplace
=
GpuGer
(
inplac
e
=
True
)
-class GpuDot22(BlasOp, Dot22):
+class GpuDot22(BlasOp):
+    __props__ = ()
+
     def make_node(self, x, y):
-        Dot22.make_node(self, x, y)
-        x = as_gpuarray_variable(x)
-        y = as_gpuarray_variable(y)
+        ctx_name = infer_context_name(x, y)
+        x = as_gpuarray_variable(x, ctx_name)
+        y = as_gpuarray_variable(y, ctx_name)
+        assert x.ndim == 2
+        assert y.ndim == 2
+        assert x.dtype == y.dtype
         return Apply(self, [x, y], [x.type()])
...
...
@@ -268,7 +289,7 @@ class GpuDot22(BlasOp, Dot22):
        dims[1] = PyGpuArray_DIMS(%(B)s)[1];
        if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
-                              pygpu_default_context())) {
+                              %(A)s->context)) {
          %(fail)s
        }
...
@@ -287,32 +308,24 @@ class GpuDot22(BlasOp, Dot22):
         return code

     def c_code_cache_version(self):
-        return (3,)
+        return (4,)
 gpu_dot22 = GpuDot22()

-@local_optimizer([gpugemv_no_inplace], inplace=True)
-def local_inplace_gpuagemv(node):
-    if node.op == gpugemv_no_inplace:
-        return [gpugemv_inplace(*node.inputs)]
+@inplace_allocempty(GpuGemv, 0)
+def local_inplace_gpuagemv(node, inputs):
+    return [gpugemv_inplace(*inputs)]

-@local_optimizer([gpugemm_no_inplace], inplace=True)
-def local_inplace_gpuagemm(node):
-    if node.op == gpugemm_no_inplace:
-        inputs = list(node.inputs)
-        C = inputs[0]
-        if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
-                len(C.clients) > 1):
-            inputs[0] = C.owner.op(*C.owner.inputs)
-        return [gpugemm_inplace(*inputs)]
+@inplace_allocempty(GpuGemm, 0)
+def local_inplace_gpuagemm(node, inputs):
+    return [gpugemm_inplace(*inputs)]

-@local_optimizer([gpuger_no_inplace], inplace=True)
-def local_inplace_gpuager(node):
-    if node.op == gpuger_no_inplace:
-        return [gpuger_inplace(*node.inputs)]
+@inplace_allocempty(GpuGer, 0)
+def local_inplace_gpuager(node, inputs):
+    return [gpuger_inplace(*inputs)]

 gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv,
                                             local_inplace_gpuagemm,
...
...
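All three rewrites above share one shape: match the non-inplace op and rebuild the node with its destructive twin. A framework-free sketch of that substitution (the `Gemv` stand-in is hypothetical and only models the `inplace` property):

```python
class Gemv:
    """Stand-in op: only the inplace property matters for the rewrite."""
    def __init__(self, inplace=False):
        self.inplace = inplace

    def __eq__(self, other):
        return type(self) is type(other) and self.inplace == other.inplace

    def __hash__(self):
        return hash((type(self), self.inplace))

gemv_no_inplace = Gemv(inplace=False)
gemv_inplace = Gemv(inplace=True)

def local_inplace_gemv(op, inputs):
    # Rebuild the node with the destructive twin of the matched op;
    # return None when the rewrite does not apply.
    if op == gemv_no_inplace:
        return gemv_inplace, list(inputs)
    return None
```

The real `inplace_allocempty` decorator additionally handles the case where input 0 comes from a shared `GpuAllocEmpty`, which is why the hand-written `GpuAllocEmpty` check could be deleted from `local_inplace_gpuagemm`.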
theano/sandbox/gpuarray/conv.cu  (diff collapsed)

theano/sandbox/gpuarray/conv.py
 import copy
 import os

-import theano
-from theano import config, gof
+from theano import gof

 try:
     from pygpu import gpuarray
...
@@ -10,7 +9,8 @@ except ImportError:
     pass

 from .type import GpuArrayType
-from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
+from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
+                        infer_context_name)
 from theano.gof import utils
...
...
@@ -58,6 +58,9 @@ class GpuConv(GpuKernelBase, gof.Op):
         them.
     """
+    __props__ = ('border_mode', 'subsample', 'logical_img_hw',
+                 'logical_kern_hw', 'logical_kern_align_top', 'version',
+                 'verbose', 'kshp', 'imshp', 'max_threads_dim0')

     @staticmethod
     def logical_output_shape_2d(imshp, kshp, mode):
...
@@ -67,20 +70,13 @@ class GpuConv(GpuKernelBase, gof.Op):
             return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
         raise ValueError(mode)

     def __init__(self, border_mode, subsample=(1, 1), logical_img_hw=None,
                  logical_kern_hw=None, logical_kern_align_top=True,
                  version=-1, direction_hint=None, verbose=0, kshp=None,
                  imshp=None, max_threads_dim0=None,
                  nkern=None, bsize=None, fft_opt=True):
         self.border_mode = border_mode
         self.subsample = subsample
         if logical_img_hw is not None:
...
...
@@ -108,19 +104,6 @@ class GpuConv(GpuKernelBase, gof.Op):
         self.bsize = bsize
         self.fft_opt = fft_opt

-    def __eq__(self, other):
-        return type(self) == type(other) \
-            and self.border_mode == other.border_mode \
-            and self.subsample == other.subsample \
-            and self.logical_img_hw == other.logical_img_hw \
-            and self.logical_kern_hw == other.logical_kern_hw \
-            and self.logical_kern_align_top == other.logical_kern_align_top \
-            and self.version == other.version \
-            and self.verbose == other.verbose \
-            and self.kshp == other.kshp \
-            and self.imshp == other.imshp \
-            and self.max_threads_dim0 == other.max_threads_dim0
-
     def __setstate__(self, d):
         self.__dict__.update(d)
         if not hasattr(self, "imshp"):
...
...
@@ -136,32 +119,6 @@ class GpuConv(GpuKernelBase, gof.Op):
         if not hasattr(self, "fft_opt"):
             self.fft_opt = True

-    def __hash__(self):
-        # don't use hash(self.version) as hash(-1)==-2 and
-        # hash(-2)==-2 in python!
-        return hash(type(self)) \
-            ^ hash(self.border_mode) \
-            ^ hash(self.subsample) \
-            ^ hash(self.logical_img_hw) \
-            ^ hash(self.logical_kern_hw) \
-            ^ hash(self.logical_kern_align_top) \
-            ^ self.version \
-            ^ hash(self.verbose) \
-            ^ hash(self.kshp) \
-            ^ hash(self.imshp) \
-            ^ hash(self.max_threads_dim0)
-
-    def __str__(self):
-        return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
-            self.__class__.__name__,
-            self.border_mode,
-            str(self.subsample),
-            str(self.logical_img_hw),
-            str(self.logical_kern_hw),
-            str(self.logical_kern_align_top),
-            str(self.imshp),
-            str(self.kshp))
-
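The hand-written `__eq__` and `__hash__` deleted here are exactly what a `__props__` declaration generates: two ops compare equal when their class and every listed property agree. A sketch of the mechanism; note that CPython really does hash -1 and -2 to the same value, which is why the old code XOR-ed `self.version` in raw form instead of hashing it:

```python
class PropsOp:
    """Sketch of the auto __eq__/__hash__ that __props__ provides."""
    __props__ = ('border_mode', 'subsample', 'version')

    def __init__(self, border_mode, subsample, version):
        self.border_mode = border_mode
        self.subsample = subsample
        self.version = version

    def _props(self):
        # Collect the identity-defining attributes in declaration order.
        return tuple(getattr(self, p) for p in self.__props__)

    def __eq__(self, other):
        return type(self) is type(other) and self._props() == other._props()

    def __hash__(self):
        return hash((type(self),) + self._props())
```

Hashing the whole tuple at once also sidesteps the `hash(-1) == hash(-2)` quirk the old comment warned about, since the tuple hash mixes positions rather than XOR-ing raw values.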
     def make_node(self, img, kern):
         if img.dtype != "float32" or kern.dtype != "float32":
             raise NotImplementedError("GpuConv currently only work"
...
@@ -170,13 +127,17 @@ class GpuConv(GpuKernelBase, gof.Op):
             raise TypeError('img must be 4D tensor')
         if kern.type.ndim != 4:
             raise TypeError('kern must be 4D tensor')
-        img = as_gpuarray_variable(img)
-        kern = as_gpuarray_variable(kern)
+        ctx_name = infer_context_name(img, kern)
+        img = as_gpuarray_variable(img, ctx_name)
+        kern = as_gpuarray_variable(kern, ctx_name)
         broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
-        out = GpuArrayType(img.dtype, broadcastable)()
+        out = GpuArrayType(img.dtype, broadcastable, context_name=ctx_name)()
         return gof.Apply(self, [img, kern], [out])

+    def get_context(self, node):
+        return node.inputs[0].type.context
+
     def flops(self, inputs, outputs):
         """
         Useful with the hack in profilemode to print the MFlops.
...
...
@@ -202,22 +163,8 @@ class GpuConv(GpuKernelBase, gof.Op):
     def make_thunk(self, node, storage_map, compute_map, no_recycling):
         node_ = copy.copy(node)
         assert node.op is node_.op
-        if config.gpuarray.sync:
-            raise NotImplementedError("GpuConv do not implement gpuarray.sync Theano flag")
         if node_.op.max_threads_dim0 is None:
-            cuda = theano.sandbox.cuda
-            device_id = cuda.use.device_number
-            if device_id is None:
-                cuda.use("gpu",
-                         force=False,
-                         default_to_move_computation_to_gpu=False,
-                         move_shared_float32_to_gpu=False,
-                         enable_cuda=False,
-                         test_driver=True)
-                device_id = cuda.use.device_number
-            cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
-            prop = cuda_ndarray.device_properties(device_id)
-            node_.op.max_threads_dim0 = prop['maxThreadsDim0']
+            node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
         return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                                    compute_map, no_recycling)
...
...
@@ -232,9 +179,11 @@ class GpuConv(GpuKernelBase, gof.Op):
     def c_code_cache_version(self):
         # raise this whenever modifying any of the support_code_files
-        return (0, 22)
+        return (0, 23)

     def c_code(self, node, nodename, inp, out_, sub):
+        if node.inputs[0].type.context.kind != "cuda":
+            raise NotImplementedError("GpuConv only works for cuda devices")
         img, kern = inp
         out, = out_
         dx = self.subsample[0]
...
...
@@ -302,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
         """ % locals()
         code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
                            for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
-        kname = "conv_full_load_everything"
         gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
         bin = gk._binary
         bcode = ','.join(hex(ord(c)) for c in bin)
...
...
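The snippet above embeds the compiled kernel binary into the generated C module as a comma-separated byte list (`conv_bcode`). A sketch of that encoding step; the diff's Python 2 `ord()` call is replaced by direct byte iteration here, and `to_c_bytes` is a hypothetical helper name:

```python
def to_c_bytes(bin_data):
    # Render a binary blob as the comma-separated hex list used to
    # initialize a static C char array such as conv_bcode.
    return ','.join(hex(c) for c in bin_data)

decl = "static const char conv_bcode[] = {%s};" % to_c_bytes(b'\x00\xab')
```

Shipping the binary as static data lets the module load a pre-compiled kernel instead of recompiling the CUDA source at import time.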
@@ -313,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
        static const char conv_bcode[] = {%(bcode)s};
        static const char *conv_code = "%(code)s";
        """ % locals()
         for k in kernels:
             mod += "static GpuKernel " + k.name + '_' + name + ";\n"
-        mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
         return mod

     def c_support_code_struct(self, node, name):
         mod = GpuKernelBase.c_support_code_struct(self, node, name)
+        with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
+            mod += f.read()
         return mod

 @utils.memoize
...
...
theano/sandbox/gpuarray/conv_kernel.cu

...
@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
 //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
 //bit, it is 32 bit.
-const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
+const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers

 __device__ void load_to_shared(float * dst, const float * src, const int thread_id,
                                int nb_thread, const int N, const bool flipped=false){
   if (nb_thread < 64)
...
@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
   if (thread_id < nb_thread)
   {
     const float * my_src_ptr = (const float *)(
-        ((size_t) src) & COALESCED_ALIGN);
+        ((uintptr_t) src) & COALESCED_ALIGN);
     my_src_ptr += thread_id;
     while (my_src_ptr < src + N)
     {
...
...
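`COALESCED_ALIGN` zeroes the low 8 bits of an address so the copy loop starts on a 256-byte boundary, and the fix switches its type to `uintptr_t`, the integer type guaranteed to be able to hold a pointer. The masking arithmetic itself, sketched with plain integers:

```python
# Must be the same width as a pointer; zeroing the low 8 bits rounds an
# address down to the previous 256-byte boundary.
COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00

def align_down(addr):
    return addr & COALESCED_ALIGN

aligned = align_down(0x7F0000000123)
```

Starting loads at an aligned address lets consecutive threads read consecutive words from one coalesced memory transaction.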
theano/sandbox/gpuarray/dnn.py  (diff collapsed)

theano/sandbox/gpuarray/dnn_base.c
...
...
@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle);

 #section init_code_struct

 {
-  cuda_enter(pygpu_default_context()->ctx);
+  cuda_enter(CONTEXT->ctx);
   cudnnStatus_t err;
   APPLY_SPECIFIC(_handle) = NULL;
   if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
     PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                  cudnnGetErrorString(err));
-    cuda_exit(pygpu_default_context()->ctx);
+    cuda_exit(CONTEXT->ctx);
     FAIL;
   }
-  cuda_exit(pygpu_default_context()->ctx);
+  cuda_exit(CONTEXT->ctx);
 }
theano/sandbox/gpuarray/dnn_fwd.c

...
@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                          PyGpuArrayObject *om,
                          cudnnConvolutionDescriptor_t desc,
-                         double alpha, double beta, PyGpuArrayObject **output) {
+                         double alpha, double beta, PyGpuArrayObject **output,
+                         PyGpuContextObject *c) {
   cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
   float af = alpha, bf = beta;
   void *alpha_p;
   void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
     PyErr_SetString(PyExc_ValueError,
...
...
theano/sandbox/gpuarray/dnn_gi.c

...
@@ -4,12 +4,12 @@ int
 APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                         PyGpuArrayObject *im,
                         cudnnConvolutionDescriptor_t desc,
-                        double alpha, double beta, PyGpuArrayObject **input) {
+                        double alpha, double beta, PyGpuArrayObject **input,
+                        PyGpuContextObject *c) {
   cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
   float af = alpha, bf = beta;
   void *alpha_p;
   void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

   if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
     PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
...
...
theano/sandbox/gpuarray/dnn_gw.c

...
@@ -4,12 +4,12 @@ int
 APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                         PyGpuArrayObject *km,
                         cudnnConvolutionDescriptor_t desc,
-                        double alpha, double beta, PyGpuArrayObject **kerns) {
+                        double alpha, double beta, PyGpuArrayObject **kerns,
+                        PyGpuContextObject *c) {
   cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
   float af = alpha, bf = beta;
   void *alpha_p;
   void *beta_p;
-  PyGpuContextObject *c = pygpu_default_context();

   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
     PyErr_SetString(PyExc_ValueError,
...
...
theano/sandbox/gpuarray/dnn_pool.c

...
@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI
 int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                              cudnnPoolingDescriptor_t desc,
-                             PyGpuArrayObject **out) {
+                             PyGpuArrayObject **out,
+                             PyGpuContextObject *c) {
   cudnnStatus_t err;
   size_t dims[5];
-  PyGpuContextObject *c = pygpu_default_context();

   if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
     PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
...
...
theano/sandbox/gpuarray/dnn_pool_grad.c

...
@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                   PyGpuArrayObject *out,
                                   PyGpuArrayObject *out_grad,
                                   cudnnPoolingDescriptor_t desc,
-                                  PyGpuArrayObject **inp_grad) {
+                                  PyGpuArrayObject **inp_grad,
+                                  PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
...
@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp), PyGpuArray_DIMS(inp),
                         inp->ga.typecode,
-                        GA_C_ORDER, pygpu_default_context()) != 0) {
+                        GA_C_ORDER, c) != 0) {
    return 1;
  }
...
...
theano/sandbox/gpuarray/dnn_softmax.c

...
@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL)
 #section support_code_struct

 int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
-                            PyGpuArrayObject **out) {
+                            PyGpuArrayObject **out,
+                            PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
    return 1;
...
...
theano/sandbox/gpuarray/dnn_softmax_grad.c

...
@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL)
 int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                  PyGpuArrayObject *sm,
-                                 PyGpuArrayObject **dx) {
+                                 PyGpuArrayObject **dx,
+                                 PyGpuContextObject *c) {
  cudnnStatus_t err;
-  PyGpuContextObject *c = pygpu_default_context();

  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
    return 1;
...
...
theano/sandbox/gpuarray/elemwise.py  (diff collapsed)

theano/sandbox/gpuarray/gemm16.c
...
...
@@ -2,7 +2,7 @@
 /* Why do we need this? */
 size_t dim = 2048 * 32;
-rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(),
+rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, CONTEXT,
                        Py_None);
 if (rand_buf == NULL) {
   FAIL;
...
@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf;

 int gemm16(PyGpuArrayObject *C, float alpha, PyGpuArrayObject *A,
            PyGpuArrayObject *B,
-           float beta, PyGpuArrayObject **out) {
+           float beta, PyGpuArrayObject **out, PyGpuContextObject *c) {
   PyGpuArrayObject *_A = NULL;
   PyGpuArrayObject *_B = NULL;
   GpuKernel *gk;
...
...
theano/sandbox/gpuarray/neighbours.py
...
...
@@ -10,7 +10,8 @@ try:
 except ImportError:
     pass

-from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
+from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
+                        infer_context_name)
 from .opt import register_opt as register_gpu_opt, op_lifter
 from .type import GpuArrayType
...
@@ -25,7 +26,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
         self.mode = mode

     def make_node(self, ten4, neib_shape, neib_step):
-        ten4 = as_gpuarray_variable(ten4)
+        ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
         neib_shape = T.as_tensor_variable(neib_shape)
         neib_step = T.as_tensor_variable(neib_step)
...
@@ -37,7 +38,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
         return Apply(self, [ten4, neib_shape, neib_step],
                      [GpuArrayType(broadcastable=(False, False),
-                                   dtype=ten4.type.dtype)()])
+                                   dtype=ten4.type.dtype,
+                                   context_name=ten4.type.context_name)()])
+
+    def get_context(self, node):
+        return node.inputs[0].type.context

     def c_code_cache_version(self):
         return (11,)
...
@@ -56,7 +61,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
             kname = "k_multi_warp_less"
             k_var = "k_multi_warp_less_" + nodename
         code = """
-// a version that use less register but don't work in all case.
+// a version that uses less registers but doesn't work in all cases.
        KERNEL void %(kname)s(
            const int nb_batch,
            const int nb_stack,
...
@@ -233,6 +238,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
         return kernels

     def c_code(self, node, name, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError("cuda only")
         dtype_ten4 = node.inputs[0].dtype
         dtype_neib_shape = node.inputs[1].dtype
         dtype_neib_step = node.inputs[2].dtype
...
@@ -243,6 +250,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
         ten4, neib_shape, neib_step = inp
         z, = out
         fail = sub['fail']
+        ctx = sub['context']
         mode = self.mode
         err_check = """
            if (err != GA_NO_ERROR) {
...
@@ -369,8 +377,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
            dims[0] = z_dim0;
            dims[1] = z_dim1;
-           %(z)s = pygpu_empty(2, dims, %(typecode_z)s,
-                               GA_C_ORDER, pygpu_default_context(), Py_None);
+           %(z)s = pygpu_empty(2, dims, %(typecode_z)s,
+                               GA_C_ORDER, %(ctx)s, Py_None);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
...
@@ -453,7 +460,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 @op_lifter([Images2Neibs])
-def use_gpu_images2neibs(node):
+def use_gpu_images2neibs(node, context_name):
     if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
         return GpuImages2Neibs(node.op.mode)
...
...
theano/sandbox/gpuarray/nerv.py
...
...
@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
 from theano.scalar import as_scalar, constant

 from . import opt
-from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty)
+from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
+                        infer_context_name)
+from .type import gpu_context_type
 from .opt_util import alpha_merge, output_merge
-from .pycuda_helper import ensure_pycuda_context

 try:
     from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
...
@@ -43,6 +43,7 @@ def ensure_float(val, name):
 class Gemm16(COp):
     __props__ = ('relu', 'inplace')
     _f16_ok = True
+    context_type = gpu_context_type

     KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
                   'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
                   'tn_128x128', 'tn_128x64', 'tn_128x32',
...
@@ -61,10 +62,11 @@ class Gemm16(COp):
     def make_node(self, C, alpha, A, B, beta):
         if GPUTensor is None:
             raise RuntimeError("Can't use Gemm16: nervanagpu not found")

-        A = as_gpuarray_variable(A)
-        B = as_gpuarray_variable(B)
-        C = as_gpuarray_variable(C)
+        ctx_name = infer_context_name(C, A, B)
+        A = as_gpuarray_variable(A, ctx_name)
+        B = as_gpuarray_variable(B, ctx_name)
+        C = as_gpuarray_variable(C, ctx_name)

         alpha = ensure_float(alpha, 'alpha')
         beta = ensure_float(beta, 'beta')
...
@@ -73,27 +75,8 @@ class Gemm16(COp):
         return Apply(self, [C, alpha, A, B, beta], [C.type()])

-    def perform(self, node, inputs, outputs):
-        ensure_pycuda_context()
-        C, alpha, A, B, beta = inputs
-        # The nervana code does not support the case where both inputs
-        # are trans, so we need to copy one if them if that is the
-        # case. We copy the smaller one.
-        if A.flags.f_contiguous and B.flags.f_contiguous:
-            if A.size < B.size:
-                A = A.copy()
-            else:
-                B = B.copy()
-
-        inplace = self.inplace
-        if inplace and not C.flags.c_contiguous:
-            inplace = False
-        if not inplace:
-            C = C.copy()
-
-        At = to_gputensor(A)
-        Bt = to_gputensor(B)
-        Ct = to_gputensor(C)
-        nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
-        outputs[0][0] = C
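The deleted `perform` method worked around a nervanagpu limitation: when both operands are transposed (Fortran-ordered) one of them must be copied, and it copies the smaller one to minimize traffic. A NumPy sketch of that choice (`np.ascontiguousarray` mirrors the `.copy()` in the diff, which also produces C-ordered memory):

```python
import numpy as np

def fix_double_trans(A, B):
    # The backend cannot take both inputs transposed, so when both are
    # Fortran-ordered, copy the smaller one into fresh C-ordered memory.
    if A.flags.f_contiguous and B.flags.f_contiguous:
        if A.size < B.size:
            A = np.ascontiguousarray(A)
        else:
            B = np.ascontiguousarray(B)
    return A, B
```

When at least one operand is already C-ordered, both arrays pass through untouched.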
+    def get_context(self, node):
+        return node.inputs[0].type.context

     def c_headers(self):
         return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
...
@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
         codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
         codel.append("const char *bcode;")
         codel.append("size_t sz;")
-        codel.append("PyGpuContextObject *c = pygpu_default_context();")
+        codel.append("PyGpuContextObject *c = %s;" % (sub['context'],))
         codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
                      "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
                      "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
...
@@ -162,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
 @opt.register_opt()
 @opt.op_lifter([tensor.Dot])
-def local_dot_to_gemm16(node):
+def local_dot_to_gemm16(node, ctx_name):
     if nerv is None:
         return
     A = node.inputs[0]
...
@@ -170,7 +153,7 @@ def local_dot_to_gemm16(node):
     if (A.ndim == 2 and B.ndim == 2 and
             A.dtype == 'float16' and B.dtype == 'float16'):
         fgraph = node.inputs[0].fgraph
-        C = GpuAllocEmpty(dtype='float16')(
+        C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
             shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
         return Gemm16()(C, 1.0, A, B, 0.0)
...
...
theano/sandbox/gpuarray/nnet.py
...
...
@@ -10,7 +10,8 @@ try:
except
ImportError
:
pass
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
)
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
infer_context_name
)
from
.type
import
GpuArrayType
from
.kernel_codegen
import
(
nvcc_kernel
,
inline_softmax
,
...
...
@@ -23,23 +24,26 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
"""
nin
=
3
nout
=
3
__props__
=
()
_f16_ok
=
True
def
make_node
(
self
,
x
,
b
,
y_idx
):
# N.B. won't work when we don't cast y_idx to float anymore
x
=
as_gpuarray_variable
(
x
)
b
=
as_gpuarray_variable
(
b
)
y_idx
=
as_gpuarray_variable
(
y_idx
)
ctx_name
=
infer_context_name
(
x
,
b
,
y_idx
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
b
=
as_gpuarray_variable
(
b
,
ctx_name
)
y_idx
=
as_gpuarray_variable
(
y_idx
,
ctx_name
)
nll
=
GpuArrayType
(
x
.
type
.
dtype
,
y_idx
.
type
.
broadcastable
)()
y_idx
.
type
.
broadcastable
,
context_name
=
ctx_name
)()
sm
=
x
.
type
()
am
=
y_idx
.
type
()
return
Apply
(
self
,
[
x
,
b
,
y_idx
],
[
nll
,
sm
,
am
])
def
get_context
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
...
...
@@ -144,6 +148,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
flags
=
flags
,
objvar
=
k_var
)]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda'
:
raise
NotImplementedError
(
'cuda only'
)
typecode_x
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
0
]
.
dtype
)
typecode_b
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
1
]
.
dtype
)
typecode_y_idx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
2
]
.
dtype
)
...
...
@@ -163,6 +169,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
dtype_am
=
node
.
outputs
[
2
]
.
dtype
classname
=
self
.
__class__
.
__name__
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
k_var
=
"k_xent_sm_1hot_bias_
%(nodename)
s"
%
locals
()
err_check
=
"""
if (err != GA_NO_ERROR) {
...
...
@@ -214,9 +221,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
Py_XDECREF(
%(nll)
s);
%(nll)
s = pygpu_empty(1, PyGpuArray_DIMS(
%(y_idx)
s),
%(typecode_x)
s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode_x)
s, GA_C_ORDER,
%(ctx)
s,
Py_None);
if (!
%(nll)
s) {
%(fail)
s
}
...
...
@@ -229,9 +235,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
Py_XDECREF(
%(sm)
s);
%(sm)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(x)
s),
%(typecode_b)
s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode_b)
s, GA_C_ORDER,
%(ctx)
s, Py_None);
if(!
%(sm)
s)
{
PyErr_SetString(PyExc_MemoryError,
...
...
@@ -246,9 +251,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
Py_XDECREF(
%(am)
s);
%(am)
s = pygpu_empty(1, PyGpuArray_DIMS(
%(y_idx)
s),
%(typecode_y_idx)
s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode_y_idx)
s, GA_C_ORDER,
%(ctx)
s, Py_None);
if(!
%(am)
s)
{
PyErr_SetString(PyExc_MemoryError,
...
...
@@ -306,18 +310,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
    Gradient wrt x of the CrossentropySoftmax1Hot Op.
    """
    nin = 3
    nout = 1
    __props__ = ()
    _f16_ok = True

    def make_node(self, dnll, sm, y_idx):
-       dnll = as_gpuarray_variable(dnll)
-       sm = as_gpuarray_variable(sm)
-       y_idx = as_gpuarray_variable(y_idx)
+       ctx_name = infer_context_name(dnll, sm, y_idx)
+       dnll = as_gpuarray_variable(dnll, ctx_name)
+       sm = as_gpuarray_variable(sm, ctx_name)
+       y_idx = as_gpuarray_variable(y_idx, ctx_name)
        return Apply(self, [dnll, sm, y_idx], [sm.type()])

+   def get_context(self, node):
+       return node.inputs[0].type.context

    def c_code_cache_version(self):
        return (11,)
...
...
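The recurring `make_node` change in this commit is the same pattern everywhere: first infer a single context name shared by the GPU inputs, then pass it to every `as_gpuarray_variable` call so all inputs land in the same device context. A hedged, Theano-free sketch of that lookup (not the real `infer_context_name` implementation; `FakeGpuVar` is a stand-in for a variable carrying a `GpuArrayType`):

```python
class FakeGpuVar(object):
    # Stand-in for a Theano variable whose type may carry a context name.
    def __init__(self, context_name=None):
        self.context_name = context_name

def infer_context_name_sketch(*variables):
    # Return the first explicit context name found among the inputs.
    for v in variables:
        ctx = getattr(v, 'context_name', None)
        if ctx is not None:
            return ctx
    raise ValueError("could not infer context name")

dnll, sm, y_idx = FakeGpuVar(), FakeGpuVar('dev0'), FakeGpuVar('dev0')
assert infer_context_name_sketch(dnll, sm, y_idx) == 'dev0'
```

With the context name fixed up front, `as_gpuarray_variable(v, ctx_name)` can transfer any host input onto the right device instead of the previous implicit default context.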
@@ -325,6 +332,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+       if node.inputs[0].type.context.kind != 'cuda':
+           raise NotImplementedError("cuda only")
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
...
...
@@ -338,6 +347,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
+       ctx = sub['context']
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        err_check = """
            if (err != GA_NO_ERROR) {
...
...
@@ -403,9 +413,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        {
            Py_XDECREF(%(dx)s);
            %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
-                                %(typecode_dx)s, GA_C_ORDER,
-                                pygpu_default_context(), Py_None);
+                                %(typecode_dx)s, GA_C_ORDER, %(ctx)s, Py_None);
            if (!%(dx)s) {
                %(fail)s
            }
...
...
@@ -512,14 +521,16 @@ class GpuSoftmax(GpuKernelBase, Op):
    Implement Softmax on the gpu.
    """
    __props__ = ()
    _f16_ok = True

    def make_node(self, x):
-       x = as_gpuarray_variable(x)
+       x = as_gpuarray_variable(x, infer_context_name(x))
        return Apply(self, [x], [x.type()])

+   def get_context(self, node):
+       return node.inputs[0].type.context

    def infer_shape(self, node, shape):
        return shape
...
...
@@ -530,6 +541,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+       if node.inputs[0].type.context.kind != 'cuda':
+           raise NotImplementedError("cuda only")
        dtype_x = node.inputs[0].dtype
        work_x = work_dtype(dtype_x)
        dtype_z = node.outputs[0].dtype
...
...
@@ -539,6 +552,7 @@ class GpuSoftmax(GpuKernelBase, Op):
        x, = inp
        z, = out
        fail = sub['fail']
+       ctx = sub['context']
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
...
...
@@ -568,9 +582,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                               %(typecode)s, GA_C_ORDER,
-                               pygpu_default_context(), Py_None);
+                               %(typecode)s, GA_C_ORDER, %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
...
...
@@ -698,22 +711,25 @@ class GpuSoftmax(GpuKernelBase, Op):
gpu_softmax = GpuSoftmax()


-class GpuSoftmaxWithBias (GpuKernelBase, Op):
+class GpuSoftmaxWithBias(GpuKernelBase, Op):
    """
    Implement SoftmaxWithBias on the gpu.
    """
    nin = 2
    nout = 1
    __props__ = ()
    _f16_ok = True

    def make_node(self, x, b):
-       x = as_gpuarray_variable(x)
-       b = as_gpuarray_variable(b)
+       ctx_name = infer_context_name(x, b)
+       x = as_gpuarray_variable(x, ctx_name)
+       b = as_gpuarray_variable(b, ctx_name)
        return Apply(self, [x, b], [x.type()])

+   def get_context(self, node):
+       return node.inputs[0].type.context

    def infer_shape(self, node, shape):
        return [shape[0]]
...
...
@@ -724,6 +740,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
+       if node.inputs[0].type.context.kind != 'cuda':
+           raise NotImplementedError('cuda only')
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
...
...
@@ -735,6 +753,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        x, b = inp
        z, = out
        fail = sub['fail']
+       ctx = sub['context']
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
...
...
@@ -777,9 +796,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                               %(typecode)s, GA_C_ORDER,
-                               pygpu_default_context(), Py_None);
+                               %(typecode)s, GA_C_ORDER, %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
...
...
theano/sandbox/gpuarray/opt.py (diff collapsed)
theano/sandbox/gpuarray/opt_util.py
...
...
@@ -294,7 +294,7 @@ def inplace_allocempty(op, idx):
    function can be as simple as:

        def maker(node, inputs):
-           return node.op.__class__(inplace=True)(*inputs)
+           return [node.op.__class__(inplace=True)(*inputs)]

    Parameters
    ----------
...
...
@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx):
        if (alloc.owner and isinstance(alloc.owner.op, GpuAllocEmpty) and
                len(alloc.clients) > 1):
-           alloc_op = GpuAllocEmpty(alloc.owner.op.dtype)
+           alloc_op = GpuAllocEmpty(alloc.owner.op.dtype,
+                                    alloc.owner.op.context_name)
            inputs[idx] = alloc_op(*alloc.owner.inputs)
        return maker(node, inputs)
    return opt
...
...
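The `inplace_allocempty` docstring change above pins down its contract: the `maker` callback must return a *list* of replacement outputs, not a bare variable, because local optimizers replace one output list with another. A toy sketch of that contract with hypothetical `Op` stand-ins (not Theano's real classes):

```python
class Op(object):
    # Minimal stand-in for a Theano Op with an inplace flag.
    def __init__(self, inplace=False):
        self.inplace = inplace

    def __call__(self, *inputs):
        # A real Op would build an Apply node; a tuple suffices here.
        return ('apply', self, inputs)

def maker(node_op, inputs):
    # One replacement output per original output, wrapped in a list,
    # as the updated docstring requires.
    return [type(node_op)(inplace=True)(*inputs)]

replacements = maker(Op(), ['x', 'y'])
assert isinstance(replacements, list) and len(replacements) == 1
```

Returning a list keeps the optimizer interface uniform for multi-output Ops, which is presumably why the docstring was corrected here.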
theano/sandbox/gpuarray/pycuda_helper.py (deleted, 100644 → 0)
-try:
-    from pycuda.driver import Context
-    if not hasattr(Context, 'attach'):
-        raise ImportError('too old')
-except ImportError:
-    Context = None
-
-pycuda_initialized = False
-pycuda_context = None
-
-
-def ensure_pycuda_context():
-    global pycuda_context, pycuda_initialized
-    if not pycuda_initialized:
-        if Context is None:
-            raise RuntimeError("PyCUDA not found or too old.")
-        else:
-            pycuda_context = Context.attach()
-            import atexit
-            atexit.register(pycuda_context.detach)
-            pycuda_initialized = True
-    return pycuda_context
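The deleted `pycuda_helper` followed a common lazy-singleton pattern: attach an external resource on first use and register its cleanup with `atexit`. A generic, PyCUDA-free sketch of that pattern (`ensure_resource` and the dict handle are illustrative stand-ins for `Context.attach`/`Context.detach`):

```python
import atexit

_resource = None

def ensure_resource():
    # Initialize exactly once, on first call, and schedule cleanup
    # at interpreter exit; later calls return the same object.
    global _resource
    if _resource is None:
        _resource = {'handle': object()}   # stand-in for Context.attach()
        atexit.register(_resource.clear)   # stand-in for Context.detach
    return _resource

assert ensure_resource() is ensure_resource()
```

With per-context handling moved into the gpuarray types themselves in this commit, a process-global default context helper like this is no longer needed.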
theano/sandbox/gpuarray/subtensor.py
from __future__ import print_function
-import copy
import os
+import copy

import numpy

import theano
-from theano import tensor, gof, config
from theano.gof.utils import MethodNotDefined
+from theano import tensor, gof
from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
...
...
@@ -19,7 +18,8 @@ except ImportError:
    pass

from .type import GpuArrayType
-from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
+from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
+                        infer_context_name)
from .elemwise import GpuElemwise
...
...
@@ -27,10 +27,12 @@ class GpuSubtensor(HideC, Subtensor):
    _f16_ok = True

    def make_node(self, x, *inputs):
+       ctx_name = infer_context_name(x)
        rval = tensor.Subtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
-                            broadcastable=rval.outputs[0].type.broadcastable)
-       x = as_gpuarray_variable(x)
+                            broadcastable=rval.outputs[0].type.broadcastable,
+                            context_name=ctx_name)
+       x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
...
...
@@ -191,14 +193,18 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)

    def make_node(self, x, y, *inputs):
-       x = as_gpuarray_variable(x)
-       y = as_gpuarray_variable(y)
+       ctx_name = infer_context_name(x, y)
+       x = as_gpuarray_variable(x, ctx_name)
+       y = as_gpuarray_variable(y, ctx_name)
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        op = copy.copy(self)
        ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
        op.create_iadd_node(ret)
        return ret

+   def get_context(self, node):
+       return node.outputs[0].type.context

    def create_iadd_node(self, node):
        # We store an iadd_node in the op that contains the info needed
        # for the inplace add.
...
...
@@ -210,7 +216,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        iadd_node = gop(xview, y).owner
        self.iadd_node = iadd_node

-   def perform(self, node, inputs, out_):
+   def perform(self, node, inputs, out_, ctx):
        out, = out_
        x, y = inputs[:2]
        indices = list(reversed(inputs[2:]))
...
...
@@ -321,7 +327,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
                           %(view_ndim)s,
                           dims,
                           xview_strides,
-                          pygpu_default_context(),
+                          %(x)s->context,
                           1,
                           (PyObject *)%(x)s,
                           (PyObject *)&PyGpuArrayType);
...
...
@@ -355,10 +361,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """
        return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()

-   def c_support_code_apply(self, node, nodename):
+   def c_support_code_struct(self, node, nodename):
        gop = self.iadd_node.op
        sub_name = nodename + "_add_to_zview"
-       ret = gop.c_support_code_apply(self.iadd_node, sub_name)
+       ret = gop.c_support_code_struct(self.iadd_node, sub_name)
        ret += """
        PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
                                                    PyGpuArrayObject* src){
...
...
@@ -366,10 +372,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """ % locals()
        inputs = ["dst", "src"]
        outputs = ["ret"]
-       sub = {"fail": "return NULL;"}
+       sub = {"fail": "return NULL;",
+              "context": "dst->context"}
        ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
        ret += """
-           return dst;
+           return ret;
        }
        """
        return ret
...
...
@@ -399,7 +406,8 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
    def make_node(self, x, ilist):
-       x_ = as_gpuarray_variable(x)
+       ctx_name = infer_context_name(x, ilist)
+       x_ = as_gpuarray_variable(x, ctx_name)
        ilist__ = tensor.as_tensor_variable(ilist)
        if ilist__.type.dtype[:3] not in ('int', 'uin'):
...
...
@@ -407,7 +415,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
        if ilist__.type.dtype != 'int64':
            ilist__ = tensor.cast(ilist__, 'int64')
-       ilist_ = as_gpuarray_variable(ilist__)
+       ilist_ = as_gpuarray_variable(ilist__, ctx_name)
        if ilist_.type.dtype != 'int64':
            raise TypeError('index must be int64')
...
...
@@ -419,6 +427,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
        bcast = ilist_.broadcastable + x_.broadcastable[1:]
        return gof.Apply(self, [x_, ilist_],
                         [GpuArrayType(dtype=x.dtype,
+                                      context_name=ctx_name,
                                       broadcastable=bcast)()])

    def perform(self, node, inp, out_):
...
...
@@ -475,8 +484,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
    """
    def make_node(self, x, y, ilist):
-       x_ = as_gpuarray_variable(x)
-       y_ = as_gpuarray_variable(y)
+       ctx_name = infer_context_name(x, y)
+       x_ = as_gpuarray_variable(x, ctx_name)
+       y_ = as_gpuarray_variable(y, ctx_name)
        ilist_ = tensor.as_tensor_variable(ilist)

        assert x_.type.dtype == y_.type.dtype
...
...
@@ -567,16 +577,16 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
    Only available on compute capability 2.0 and more recent.
    """
    _f16_ok = True

    def make_node(self, x, y, ilist):
        """It differs from GpuAdvancedIncSubtensor1 in that it makes sure
        the indices are of type long.
        """
-       x_ = as_gpuarray_variable(x)
-       y_ = as_gpuarray_variable(y)
-       ilist_ = as_gpuarray_variable(ilist)
+       ctx_name = infer_context_name(x, y, ilist)
+       x_ = as_gpuarray_variable(x, ctx_name)
+       y_ = as_gpuarray_variable(y, ctx_name)
+       ilist_ = as_gpuarray_variable(ilist, ctx_name)

        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim
...
...
@@ -599,32 +609,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

+   def get_context(self, node):
+       return node.outputs[0].type.context

    def perform(self, node, inp, out, ctx):
        return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)

    def c_code_cache_version(self):
        return (6,)

    def c_headers(self):
-       if pygpu.get_default_context().kind == 'opencl':
-           raise MethodNotDefined('cuda only')
-       return ['cuda.h', '<numpy_compat.h>', '<gpuarray_helper.h>',
-               '<gpuarray/types.h>']
+       return ['<numpy_compat.h>', '<gpuarray_helper.h>',
+               '<gpuarray/types.h>']

    def c_header_dirs(self):
-       if pygpu.get_default_context().kind == 'opencl':
-           raise MethodNotDefined('cuda only')
-       cuda_root = config.cuda.root
-       res = [os.path.dirname(__file__)]
-       if cuda_root:
-           res.append(os.path.join(cuda_root, 'include'))
-       return res
+       return [os.path.dirname(__file__)]

    def c_code(self, node, name, inputs, outputs, sub):
-       active_device_no = theano.sandbox.cuda.active_device_number()
-       device_properties = theano.sandbox.cuda.device_properties
-       compute_capability = device_properties(active_device_no)['major']
-       if ((self.set_instead_of_inc) or
-           (node.inputs[0].ndim != node.inputs[1].ndim) or
-           (node.inputs[0].ndim != 2) or
-           (compute_capability < 2)):
+       ctx = self.get_context(node)
+       if ctx.kind != 'cuda':
+           raise NotImplementedError("cuda only")
+       if (self.set_instead_of_inc or
+               node.inputs[0].ndim != node.inputs[1].ndim or
+               node.inputs[0].ndim != 2 or
+               ctx.bin_id[-2] < '2'):
            raise NotImplementedError("This case does not have C code yet.")

        x = inputs[0]
...
...
@@ -754,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]

-   def c_support_code_apply(self, node, nodename):
+   def c_support_code_struct(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
...
...
@@ -765,7 +773,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
        itemsize_out = numpy.dtype(dtype_out).itemsize
        k_var = "k_vector_add_fast_" + nodename

-       return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
+       return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
        int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                     PyGpuArrayObject* py_other,
                                     PyGpuArrayObject *indices_arr)
...
...
theano/sandbox/gpuarray/tests/config.py (new file, mode 100644)
from nose.plugins.skip import SkipTest

import theano.sandbox.gpuarray

if theano.sandbox.gpuarray.pygpu is None:
    raise SkipTest("pygpu not installed")

if not theano.sandbox.gpuarray.pygpu_activated:
    import theano.sandbox.cuda as cuda_ndarray
    if cuda_ndarray.cuda_available:
        cuda_ndarray.use('gpu',
                         default_to_move_computation_to_gpu=False,
                         move_shared_float32_to_gpu=False,
                         enable_cuda=False)
    theano.sandbox.gpuarray.init_dev('cuda')

if not theano.sandbox.gpuarray.pygpu_activated:
    raise SkipTest("pygpu disabled")

test_ctx_name = None

if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including(
        'gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding(
        'gpuarray')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including(
        'gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding(
        'gpuarray')
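The new `tests/config.py` centralizes the mode-selection logic the test files used to duplicate: `FAST_COMPILE` is mapped to `FAST_RUN` so GPU tests always run with optimizations, and the `'gpuarray'` optimizer set is included or excluded per mode. A minimal, Theano-free sketch of that decision (`pick_modes` and its tuple encoding are illustrative only):

```python
def pick_modes(config_mode):
    # GPU tests never run in FAST_COMPILE; fall back to FAST_RUN.
    base = 'FAST_RUN' if config_mode == 'FAST_COMPILE' else config_mode
    # Encode each mode as (base mode, included opt sets, excluded opt sets).
    mode_with_gpu = (base, ('gpuarray',), ('gpu',))
    mode_without_gpu = (base, (), ('gpuarray',))
    return mode_with_gpu, mode_without_gpu

with_gpu, without_gpu = pick_modes('FAST_COMPILE')
assert with_gpu[0] == 'FAST_RUN'
```

Centralizing this in one importable module is what lets `test_basic_ops.py` below shrink its setup to a single `from .config import ...` line.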
theano/sandbox/gpuarray/tests/test_basic_ops.py
...
...
@@ -13,53 +13,22 @@ from theano.tensor.basic import alloc
from theano.tensor.tests import test_basic
from theano.tensor.tests.test_basic import rand, safe_make_node
from theano.tests import unittest_tools as utt
-from theano.tests.unittest_tools import SkipTest

import theano.sandbox.gpuarray

-from ..type import (GpuArrayType,
+from ..type import (GpuArrayType, get_context,
                     gpuarray_shared_constructor)
-from ..basic_ops import (host_from_gpu, gpu_from_host, HostFromGpu,
-                         GpuFromHost, GpuReshape, gpu_alloc, GpuAlloc,
-                         GpuAllocEmpty, GpuContiguous,
+from ..basic_ops import (host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape,
+                         GpuAlloc, GpuAllocEmpty, GpuContiguous,
                          gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor

-import theano.sandbox.cuda as cuda_ndarray
-try:
-    from pygpu import gpuarray
-except:
-    pass
-if theano.sandbox.gpuarray.pygpu is None:
-    raise SkipTest("pygpu not installed")
+from .config import mode_with_gpu, mode_without_gpu, test_ctx_name

# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
-if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
-    if not cuda_ndarray.use.device_number:
-        # We should not enable all the use like the flag device=gpu,
-        # as many tests don't work in that setup.
-        cuda_ndarray.use('gpu', default_to_move_computation_to_gpu=False,
-                         move_shared_float32_to_gpu=False,
-                         enable_cuda=False)
-    theano.sandbox.gpuarray.init_dev('cuda')
-
-if not theano.sandbox.gpuarray.pygpu_activated:
-    raise SkipTest("pygpu disabled")
+from pygpu import gpuarray

utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed())

-if theano.config.mode == 'FAST_COMPILE':
-    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including(
-        'gpuarray').excluding('gpu')
-    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding(
-        'gpuarray')
-else:
-    mode_with_gpu = theano.compile.mode.get_default_mode().including(
-        'gpuarray').excluding('gpu')
-    mode_without_gpu = theano.compile.mode.get_default_mode().excluding(
-        'gpuarray')


def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
                 on_unused_input='raise', name=None):
...
...
@@ -88,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs):
    cls = kwargs.pop('cls', None)
    if len(kwargs) != 0:
        raise TypeError('Unexpected argument %s', list(kwargs.keys())[0])
-   return gpuarray.array(r, dtype=dtype, cls=cls)
+   return gpuarray.array(r, dtype=dtype, cls=cls,
+                         context=get_context(test_ctx_name))
def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
...
...
@@ -114,6 +84,7 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
    def test_all(self):
        if skip:
+           from nose.plugins.skip import SkipTest
            raise SkipTest(skip)

        for testname, inputs in iteritems(cases):
...
...
@@ -199,9 +170,9 @@ def test_transfer_cpu_gpu():
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    av = numpy.asarray(rng.rand(5, 4), dtype='float32')
-   gv = gpuarray.array(av)
+   gv = gpuarray.array(av, context=get_context(test_ctx_name))

-   f = theano.function([a], gpu_from_host(a))
+   f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)
...
...
@@ -218,12 +189,12 @@ def test_transfer_strided():
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    av = numpy.asarray(rng.rand(5, 8), dtype='float32')
-   gv = gpuarray.array(av)
+   gv = gpuarray.array(av, context=get_context(test_ctx_name))
    av = av[:, ::2]
    gv = gv[:, ::2]

-   f = theano.function([a], gpu_from_host(a))
+   f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)
...
...
@@ -233,14 +204,14 @@ def test_transfer_strided():
def gpu_alloc_expected(x, *shp):
-   g = gpuarray.empty(shp, dtype=x.dtype)
+   g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    g[:] = x
    return g


GpuAllocTester = makeTester(
    name="GpuAllocTester",
    op=alloc,
-   gpu_op=gpu_alloc,
+   gpu_op=GpuAlloc(test_ctx_name),
    cases=dict(
        correct01=(rand(), numpy.int32(7)),
        # just gives a DeepCopyOp with possibly wrong results on the CPU
...
...
@@ -260,19 +231,19 @@ class TestAlloc(test_basic.TestAlloc):
    dtype = "float32"
    mode = mode_with_gpu
    shared = staticmethod(gpuarray_shared_constructor)
-   allocs = [GpuAlloc(), GpuAlloc(), T.Alloc()]
+   allocs = [GpuAlloc(test_ctx_name), GpuAlloc(test_ctx_name), T.Alloc()]


def test_alloc_empty():
    for dt in ['float32', 'int8']:
-       f = theano.function([], GpuAllocEmpty(dt)(2, 3))
+       f = theano.function([], GpuAllocEmpty(dt, context_name=test_ctx_name)(2, 3))
        assert len(f.maker.fgraph.apply_nodes) == 1
        out = f()
        assert out.shape == (2, 3)
        assert out.dtype == dt

-   f = theano.function([], [GpuAllocEmpty('uint64')(3, 2),
-                            GpuAllocEmpty('uint64')(3, 2)])
+   f = theano.function([], [GpuAllocEmpty('uint64', test_ctx_name)(3, 2),
+                            GpuAllocEmpty('uint64', test_ctx_name)(3, 2)])
    out = f()
    assert out[0].shape == (3, 2)
    assert out[0].dtype == 'uint64'
...
...
@@ -284,7 +255,7 @@ def test_alloc_empty():
def test_shape():
    x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
-   v = gpuarray.zeros((3, 4, 5), dtype='float32')
+   v = gpuarray.zeros((3, 4, 5), dtype='float32',
+                      context=get_context(test_ctx_name))
    f = theano.function([x], x.shape)
    topo = f.maker.fgraph.toposort()
    assert numpy.all(f(v) == (3, 4, 5))
...
...
@@ -436,12 +407,13 @@ def test_hostfromgpu_shape_i():
    ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()

    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
-   cv = gpuarray.asarray(numpy.random.rand(5, 4), dtype='float32')
+   cv = gpuarray.asarray(numpy.random.rand(5, 4),
+                         dtype='float32',
+                         context=get_context(test_ctx_name))

-   f = theano.function([a], gpu_from_host(a), mode=m)
-   assert gpu_from_host in [x.op for x in f.maker.fgraph.toposort()]
-   f = theano.function([a], gpu_from_host(a).shape, mode=m)
+   f = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
+   assert any(isinstance(x.op, GpuFromHost)
+              for x in f.maker.fgraph.toposort())
+   f = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
...
...
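The test above switches from an identity check (`gpu_from_host in [...]`) to an `isinstance` check, because `GpuFromHost` is now parameterized by a context name and each graph holds its own instance. A standalone sketch of why identity fails but a type check still works (`Node` and the dummy `GpuFromHost` here are illustrative stand-ins, not Theano's classes):

```python
class GpuFromHost(object):
    # Stand-in op: two instances with the same context are distinct objects.
    def __init__(self, context_name):
        self.context_name = context_name

class Node(object):
    def __init__(self, op):
        self.op = op

toposort = [Node(GpuFromHost('dev0')), Node(object())]

# The old identity test no longer matches: a fresh instance differs.
assert not any(n.op is GpuFromHost('dev0') for n in toposort)
# The type-based test still finds the op in the graph.
assert any(isinstance(n.op, GpuFromHost) for n in toposort)
```

Comparing by type (or by `==` with a proper `__props__`-based equality) is the robust choice once ops carry constructor parameters.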
theano/sandbox/gpuarray/tests/test_blas.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_dnn.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_elemwise.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_neighbours.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_nerv.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_nnet.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_opt.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_scan.py (diff collapsed)
theano/sandbox/gpuarray/tests/test_subtensor.py (diff collapsed)
theano/sandbox/gpuarray/type.py (diff collapsed)
theano/sandbox/rng_mrg.py (diff collapsed)
theano/scan_module/scan_op.py (diff collapsed)
theano/scan_module/scan_opt.py (diff collapsed)
theano/scan_module/scan_perform_ext.py (diff collapsed)
theano/scan_module/scan_utils.py
theano/scan_module/tests/test_scan.py (diff collapsed)
theano/tensor/basic.py (diff collapsed)
theano/tensor/nnet/neighbours.py (diff collapsed)
theano/tensor/opt.py (diff collapsed)
theano/tensor/tests/test_basic.py (diff collapsed)
theano/tensor/tests/test_elemwise.py (diff collapsed)
theano/tests/test_flake8.py (diff collapsed)
theano/tests/test_printing.py (diff collapsed)