提交 755f3648 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #4628 from abergeron/single_stream_flag

Single stream flag
...@@ -487,6 +487,21 @@ import theano and print the config variable, as in:
automatically to get more memory. But this can cause
fragmentation, see note above.
.. attribute:: config.gpuarray.single_stream
Boolean value
Default: ``True``
Control the stream mode of contexts.
If your computations are mostly lots of small elements, using
single-stream will avoid the synchronization overhead and usually
be faster. For larger elements it does not make a difference yet.
In the future when true multi-stream is enabled in libgpuarray,
this may change. If you want to make sure to have optimal
performance, check both options.
.. attribute:: linker
......
...@@ -242,6 +242,19 @@ AddConfigVar('gpuarray.preallocate',
FloatParam(0),
in_c_key=False)
# Register the new ``gpuarray.single_stream`` Theano config flag.
# When enabled, GPU contexts run in single-stream mode, which skips
# inter-stream synchronization overhead (helps workloads made of many
# small kernels). Consumed in init_dev() when creating the pygpu context.
AddConfigVar('gpuarray.single_stream',
"""
If your computations are mostly lots of small elements,
using single-stream will avoid the synchronization
overhead and usually be faster. For larger elements it
does not make a difference yet. In the future when true
multi-stream is enabled in libgpuarray, this may change.
If you want to make sure to have optimal performance,
check both options.
""",
BoolParam(True),  # boolean option, defaults to single-stream mode
in_c_key=False)  # does not affect the C compilation cache key
def safe_no_dnn_workmem(workmem):
"""
......
...@@ -62,7 +62,8 @@ def init_dev(dev, name=None):
global pygpu_activated
if dev not in init_dev.devmap:
ctx = pygpu.init(dev,
-                disable_alloc_cache=config.gpuarray.preallocate < 0)
+                disable_alloc_cache=config.gpuarray.preallocate < 0,
+                single_stream=config.gpuarray.single_stream)
init_dev.devmap[dev] = ctx
if config.gpuarray.preallocate > 0:
MB = (1024 * 1024)
......
...@@ -11,7 +11,7 @@ from theano.tensor import (DimShuffle, get_scalar_constant_value,
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty
from .elemwise import GpuDimShuffle, GpuElemwise
-_one = scal.constant(numpy.asarray(1.0, dtype='float64'))
+_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论