Make a proxy NVCC_compiler that does not create a second context on the card as a side effect.

The CUDA documentation suggests that having more than one context on a card may slow down operations, so this might give us a slight win.

Make a proxy NVCC_compiler that does not create a second context on the card as a side effect.
017b6e3e · Arnaud Bergeron · 3810c977 · 017b6e3e · 017b6e3e · 017b6e3e
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -12,7 +12,6 @@ from theano.gof.python25 import any
 from theano.gof.utils import MethodNotDefined
 from theano.compat import PY3
-from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
 try:
    import pygpu
    from pygpu import gpuarray, elemwise

--- a/theano/sandbox/gpuarray/comp.py
+++ b/theano/sandbox/gpuarray/comp.py
+import os
+import numpy
+import theano
+from theano import config
+# This is a big hack to avoid creating a second context on the card.
+from theano.sandbox.cuda.nvcc_compiler import (NVCC_compiler as NVCC_base,
+                                               hash_from_file)
+class NVCC_compiler(NVCC_base):
+    """
+    Proxy around the CUDA backend's NVCC compiler whose `compile_args`
+    is re-implemented so that it does not create an additional context
+    on the GPU.
+    """
+    @staticmethod
+    def compile_args():
+        """
+        Re-implementation of compile_args that does not create an
+        additional context on the GPU.
+
+        Returns the list of command-line flags to pass to nvcc: the
+        user-configured `nvcc.flags`, the optional fast-math switch, a
+        define carrying the hash of `cuda_ndarray.cuh`, numpy
+        compatibility defines for numpy < 1.7 and, unless the user
+        already supplied one, an `-arch=sm_XY` flag for the device
+        recorded by `theano.sandbox.gpuarray.init_dev`.
+
+        Raises Exception if an architecture flag has to be computed but
+        no device has been initialised, or if the initialised device is
+        an OpenCL one.
+        """
+        # Drop the empty strings produced by consecutive spaces.
+        flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
+        if config.nvcc.fastmath:
+            flags.append('-use_fast_math')
+        # The hash of the header is passed as a define, so the flags
+        # change whenever cuda_ndarray.cuh changes.
+        cuda_ndarray_cuh_hash = hash_from_file(
+            os.path.join(os.path.split(theano.sandbox.cuda.__file__)[0],
+                         'cuda_ndarray.cuh'))
+        flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)
+        # numpy 1.7 deprecated the following macros but they didn't
+        # exist in the past
+        numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
+        if numpy_ver < [1, 7]:
+            flags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
+            flags.append("-D NPY_ARRAY_ALIGNED=NPY_ALIGNED")
+            flags.append("-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
+            flags.append("-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
+            flags.append("-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
+            flags.append("-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
+        # If the user didn't specify architecture flags add them
+        if not any('-arch=sm_' in f for f in flags):
+            dev = theano.sandbox.gpuarray.init_dev.device
+            # Use the call form of raise: the "raise E, msg" statement
+            # form is a syntax error on Python 3.
+            if dev is None:
+                raise Exception(
+                    "Trying to compile GPU code without a context")
+            if dev.startswith("opencl"):
+                raise Exception(
+                    "Trying to call nvcc with an OpenCL context")
+            assert dev.startswith('cuda')
+            # Whatever follows the 'cuda' prefix is parsed as the device
+            # number.
+            n = int(dev[4:])
+            p = theano.sandbox.cuda.device_properties(n)
+            flags.append('-arch=sm_' + str(p['major']) + str(p['minor']))
+        return flags
--- a/theano/sandbox/gpuarray/conv.py
+++ b/theano/sandbox/gpuarray/conv.py
@@ -3,7 +3,7 @@ import os
 import theano
 from theano import config, gof
-from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
+from theano.sandbox.gpuarray.comp import NVCC_compiler
 from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -8,9 +8,8 @@ import theano
 from theano import Apply, scalar, config
 from theano import scalar as scal
 from theano.scalar import Scalar
-from theano.tensor.elemwise import (Elemwise, DimShuffle,
+from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
-                                    CAReduceDtype)
+from theano.sandbox.gpuarray.comp import NVCC_compiler
-from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
 try:
    import pygpu

--- a/theano/sandbox/gpuarray/neighbours.py
+++ b/theano/sandbox/gpuarray/neighbours.py
@@ -2,7 +2,6 @@ import numpy
 from theano import Op, Apply, config
 from theano.gof import local_optimizer
-from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
 from theano.sandbox.neighbours import Images2Neibs
 import theano.tensor as T
@@ -17,6 +16,7 @@ from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable,
 from theano.sandbox.gpuarray.opt import register_opt as register_gpu_opt
 from theano.sandbox.gpuarray.opt import op_lifter as op_lifter
 from theano.sandbox.gpuarray.type import GpuArrayType
+from theano.sandbox.gpuarray.comp import NVCC_compiler
 class GpuImages2Neibs(Images2Neibs, Op):

--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
@@ -2,7 +2,7 @@ import numpy
 from theano import Op, Apply, config
 from theano.compat.six import StringIO
-from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
+from theano.sandbox.gpuarray.comp import NVCC_compiler
 try:

--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
@@ -7,7 +7,6 @@ from theano import tensor, gof, Op
 from theano.gof.python25 import all, any
 from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
 import theano.tensor.inplace
-from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
 try:
    import pygpu
@@ -18,6 +17,8 @@ except ImportError:
 from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
 from theano.sandbox.gpuarray.elemwise import GpuElemwise
+from theano.sandbox.gpuarray.comp import NVCC_compiler
 class GpuSubtensor(HideC, Subtensor):