Merge pull request #6264 from notoraptor/remove-config-gpuarray-sync

Remove config.gpuarray.sync from Theano code.

Merge pull request #6264 from notoraptor/remove-config-gpuarray-sync
1414b2d2 · Frédéric Bastien · GitHub · d003afce · f8a7c0ed · 1414b2d2
--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -193,11 +193,14 @@ AddConfigVar(
    in_c_key=False)
+def deprecated_gpuarray_sync(val):
+    if val:
+        raise RuntimeError("Flag gpuarray.sync is deprecated and will be removed in next Theano release.")
+    return False
 AddConfigVar('gpuarray.sync',
-             """If True, every op will make sure its work is done before
+             """This flag is deprecated and will be removed in next Theano release.""",
-                returning.  Setting this to True will slow down execution,
+             ConfigParam(False, allow_override=False, filter=deprecated_gpuarray_sync),
-                but give much more accurate results in profiling.""",
-             BoolParam(False),
             in_c_key=True)
 AddConfigVar('gpuarray.preallocate',

--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -849,8 +849,6 @@ class GpuAlloc(HideC, Alloc):
                out[0][...] = v
        else:
            out[0][...] = v
-        if config.gpuarray.sync:
-            out[0].sync()
    def c_code(self, node, name, inp, out, sub):
        vv = inp[0]
@@ -915,13 +913,10 @@ class GpuAlloc(HideC, Alloc):
        """ % dict(name=name, ndim=ndim, zz=zz, vv=vv, ctx=sub['params'],
                   fail=sub['fail'], memset_0=memset_0)
-        if config.gpuarray.sync:
-            code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
        return code
    def c_code_cache_version(self):
-        return (3,)
+        return (4,)
    def do_constant_folding(self, node):
        from . import subtensor, blas
@@ -1382,7 +1377,7 @@ class GpuSplit(HideC, Split):
    # we reuse the perform of the CPU op, which is suitable
    def c_code_cache_version(self):
-        return (1,)
+        return (2,)
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>']
@@ -1514,13 +1509,6 @@ class GpuSplit(HideC, Split):
        free(split_points);
        """
-        if config.gpuarray.sync:
-            main_code += """
-        for (i = 0; i < splits_count; ++i) {
-            GpuArray_sync(&((*outputs[i])->ga));
-        }
-        """
        return main_code % locals()
@@ -1649,7 +1637,6 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
        fail = sub['fail']
        ctx = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
-        sync = bool(config.gpuarray.sync)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        size_t dims[2] = {0, 0};
@@ -1689,11 +1676,9 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
            }
        }
-        if(%(sync)d)
-            GpuArray_sync(&%(z)s->ga);
        """ % locals()
        return s
    def c_code_cache_version(self):
-        return (9,)
+        return (10,)
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -3,7 +3,7 @@ import os.path
 from six import integer_types
 import theano
-from theano import Apply, config, Op
+from theano import Apply, Op
 from theano.compile import optdb
 from theano.gof import LocalOptGroup, ParamsType
@@ -133,14 +133,11 @@ class GpuGemv(BlasOp):
            %(fail)s
        }
        """ % vars
-        if config.gpuarray.sync:
-            code += """
-            GpuArray_sync(&%(out)s->ga);
-            """ % vars
        return code
    def c_code_cache_version(self):
-        return (9,)
+        return (10,)
 gpugemv_no_inplace = GpuGemv(inplace=False)
 gpugemv_inplace = GpuGemv(inplace=True)
@@ -222,14 +219,11 @@ class GpuGemm(BlasOp):
                 %(fail)s
               }
        """ % vars
-        if config.gpuarray.sync:
-            code += """
-            GpuArray_sync(&%(out)s->ga);
-            """ % vars
        return code
    def c_code_cache_version(self):
-        return (6,)
+        return (7,)
 gpugemm_no_inplace = GpuGemm(inplace=False)
 gpugemm_inplace = GpuGemm(inplace=True)
@@ -293,14 +287,11 @@ class GpuGer(BlasOp):
                 %(fail)s
               }
               """ % vars
-        if config.gpuarray.sync:
-            code += """
-            GpuArray_sync(&%(out)s->ga);
-            """ % vars
        return code
    def c_code_cache_version(self):
-        return (4,)
+        return (5,)
 gpuger_no_inplace = GpuGer(inplace=False)
@@ -361,14 +352,11 @@ class GpuDot22(BlasOp):
            %(fail)s
        }
        """ % vars
-        if config.gpuarray.sync:
-            code += """
-            GpuArray_sync(&%(out)s->ga);
-            """ % vars
        return code
    def c_code_cache_version(self):
-        return (4,)
+        return (5,)
 gpu_dot22 = GpuDot22()
@@ -444,14 +432,11 @@ class GpuGemmBatch(BlasOp):
            %(fail)s;
        }
        """ % vars
-        if config.gpuarray.sync:
-            code += """
-            GpuArray_sync(&%(out)s->ga);
-            """ % vars
        return code
    def c_code_cache_version(self):
-        return (3,)
+        return (4,)
 gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
 gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
@@ -549,7 +534,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
    def c_code_cache_version(self):
        # Raise this whenever modifying the C code (including the file).
-        return (9,)
+        return (10,)
    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
@@ -633,16 +618,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
            if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
            width = '-1'
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            int err = GpuArray_sync(&%(out)s->ga);
-            if (err != GA_NO_ERROR) {
-                PyErr_Format(PyExc_RuntimeError,
-                             "BaseGpuCorrMM error: gpuarray sync failed.");
-                %(fail)s;
-            }
-            """ % locals()
        sub = sub.copy()
        sub.update(locals())
@@ -831,8 +807,6 @@ class BaseGpuCorrMM(CGpuKernelBase):
    }
    assert (out2 == %(out)s);
-    %(sync)s
 """ % sub
@@ -1161,7 +1135,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
    def c_code_cache_version(self):
        # raise this whenever modifying the code below.
-        return (7,)
+        return (8,)
    def c_code_helper(self, bottom, weights, top, direction, sub,
                      height=None, width=None, depth=None):
@@ -1258,16 +1232,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
            if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
                raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'")
            depth = '-1'
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            int err = GpuArray_sync(&%(out)s->ga);
-            if (err != GA_NO_ERROR) {
-                PyErr_Format(PyExc_RuntimeError,
-                             "BaseGpuCorr3dMM error: gpuarray sync failed.");
-                %(fail)s;
-            }
-            """ % locals()
        sub = sub.copy()
        sub.update(locals())
@@ -1490,8 +1455,6 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
    }
    assert (out2 == %(out)s);
-    %(sync)s
 """ % sub

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -3,7 +3,7 @@ import copy
 import numpy as np
 import theano
-from theano import Apply, scalar, config, Op
+from theano import Apply, scalar, Op
 from six.moves import StringIO, xrange
 from theano.gof.utils import MethodNotDefined
 from theano.scalar import Scalar, Composite
@@ -371,18 +371,6 @@ class GpuElemwise(HideC, Elemwise):
        }
        """ % dict(fail=sub['fail'])
-        if config.gpuarray.sync:
-            z = outputs[0]
-            code += """
-            err = GpuArray_sync(&%(z)s->ga);
-            if (err != GA_NO_ERROR) {
-                PyErr_Format(PyExc_RuntimeError,
-                             "gpuarray error: %%s.",
-                             GpuArray_error(&%(z)s->ga, err));
-                %(fail)s;
-            }
-            """ % locals()
        return str(code)
    # To disable the superclass perform.
@@ -395,7 +383,7 @@ class GpuElemwise(HideC, Elemwise):
    def c_code_cache_version(self):
        ver = self.scalar_op.c_code_cache_version()
        if ver:
-            return (8, ver)
+            return (9, ver)
        else:
            return ver
@@ -843,15 +831,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            %(err_check)s
            """ % locals(), file=sio)
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            %(err_check)s
-            """ % locals()
-        print("""
-            %(sync)s
-        """ % locals(), file=sio)
        return sio.getvalue()
    def _k_decl(self, node, nodename, pattern=None,
@@ -1128,12 +1107,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                %(fail)s;
            }
        """ % locals()
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            %(err_check)s
-            """ % locals()
        print("""
        {
          if(PyGpuArray_SIZE(%(x)s)==0){
@@ -1155,7 +1129,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
            int err = GpuKernel_call(&%(k_var)s, 1, &n_blocks, &n_threads, n_shared, kernel_params);
            %(err_check)s
-            %(sync)s
         }
        }
        """ % locals(), file=sio)
@@ -1272,12 +1245,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                %(fail)s;
            }
        """ % locals()
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            %(err_check)s
-            """ % locals()
        print("""
    {
        int verbose = 0;
@@ -1325,7 +1293,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                        (void *)&stride_Z0, (void *)&stride_Z1};
                int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
                %(err_check)s
-                %(sync)s
        }else{
            GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
@@ -1354,7 +1321,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                    (void *)&stride_Z0, (void *)&stride_Z1};
            int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
            %(err_check)s
-            %(sync)s
        }
    }
        """ % locals(), file=sio)
@@ -1375,12 +1341,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                %(fail)s;
            }
        """ % locals()
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            %(err_check)s
-            """ % locals()
        print("""
        {
            //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
@@ -1429,7 +1389,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                        (void *)&stride_Z0, (void *)&stride_Z1};
                int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
                %(err_check)s
-                %(sync)s
            }
            else
            {
@@ -1470,7 +1429,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                      );
                  %(makecall)s
                }
-                %(sync)s
            }
        }
        """ % locals(), file=sio)
@@ -1506,12 +1464,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                %(fail)s;
            }
        """ % locals()
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            %(err_check)s
-            """ % locals()
        # use threadIdx.x for i0
        # use blockIdx.x for i1
        # use blockIdx.y for i2
@@ -1562,7 +1514,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                        (void *)&stride_Z0, (void *)&stride_Z1};
                int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
                %(err_check)s
-                %(sync)s
            }
        }
        """ % locals(), file=sio)
@@ -1749,7 +1700,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_cache_version_apply(self, node):
-        version = [20]  # the version corresponding to the c code in this Op
+        version = [21]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(
@@ -2581,10 +2532,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
            %(fail)s
        }
-        if (%(sync)d)
+        """ % dict(out=out[0], inp=inp[0], fail=sub['fail'])
-            GpuArray_sync(&%(out)s->ga);
-        """ % dict(out=out[0], inp=inp[0], fail=sub['fail'],
-                   sync=bool(config.gpuarray.sync))
        k = self.get_kernel_cache(node)
        _, src, _, ls = k._get_basic_kernel(k.init_local_size,
                                            node.inputs[0].ndim)
@@ -2719,23 +2667,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
            %(output)s = tmp;
        }
-        if (%(sync)d) {
+        """ % dict(k_var='k_reduk_' + name,
-            err = GpuArray_sync(&%(output)s->ga);
-            if (err != GA_NO_ERROR) {
-                PyErr_Format(PyExc_RuntimeError,
-                             "gpuarray error: GpuCAReduceCPY: %%s.",
-                             GpuKernel_error(&%(k_var)s, err));
-                %(fail)s
-            }
-        }
-        """ % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
                   ls=ls, fail=sub['fail'], output=output, input=input,
                   cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
        return code
    def c_code_cache_version_apply(self, node):
-        return (2, self.kernel_version(node))
+        return (3, self.kernel_version(node))
    def generate_kernel(self, node, odtype, redux):
        if isinstance(self.scalar_op, scalar.basic.Add):

--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -10,7 +10,7 @@ except ImportError:
 import theano
 import theano.sandbox.multinomial
-from theano import Apply, config
+from theano import Apply
 from theano.gof import Op
 from theano.tensor import NotScalarConstantError, get_scalar_constant_value
@@ -137,7 +137,6 @@ KERNEL void k_multi_warp_multinomial(
        out, = outputs
        fail = sub['fail']
        ctx = sub['params']
-        sync = bool(config.gpuarray.sync)
        kname = self.gpu_kernels(node, name)[0].objvar
        out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        in_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
@@ -212,15 +211,14 @@ KERNEL void k_multi_warp_multinomial(
                GpuKernel_error(&%(kname)s, err));
            %(fail)s;
        }
-        if(%(sync)d)
-            GpuArray_sync(&(out->ga));
    } // END NESTED SCOPE
        """ % locals()
        return s
    def c_code_cache_version(self):
-        return (5,)
+        return (6,)
 class GPUAChoiceFromUniform(GpuKernelBase, Op):
@@ -360,7 +358,6 @@ KERNEL void k_multi_warp_multinomial_wor(
        replace = int(self.replace)
        fail = sub['fail']
        ctx = sub['params']
-        sync = bool(config.gpuarray.sync)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
    PyGpuArrayObject * pvals = %(pvals)s;
@@ -447,15 +444,14 @@ KERNEL void k_multi_warp_multinomial_wor(
           Py_DECREF(pvals_copy);
           %(fail)s;
        }
-        if(%(sync)d)
-            GpuArray_sync(&(out->ga));
        Py_DECREF(pvals_copy);
    } // END NESTED SCOPE
        """ % locals()
        return s
    def c_code_cache_version(self):
-        return (9,)
+        return (10,)
 @register_opt('fast_compile')

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
 from __future__ import absolute_import, print_function, division
-from theano import Op, Apply, config
+from theano import Op, Apply
 from theano.gof import ParamsType
 from theano.tensor.nnet.neighbours import Images2Neibs
 import theano.tensor as T
@@ -45,7 +45,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                                   context_name=ten4.type.context_name)()])
    def c_code_cache_version(self):
-        return (13,)
+        return (14,)
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']
@@ -284,12 +284,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                %(fail)s;
            }
        """ % dict(fail=sub['fail'])
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            %(err_check)s
-            """ % dict(z=out[0], err_check=err_check)
        # NB: To reduce C code variability:
        # For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
        # For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
@@ -563,13 +558,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                                     (void *)&%(z)s->ga.offset};
            err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
            %(err_check)s
-            %(sync)s
        } // END NESTED SCOPE
        """ % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
                   dtype_neib_shape=node.inputs[1].dtype,
                   dtype_neib_step=node.inputs[2].dtype,
                   err_check=err_check,
-                   sync=sync,
                   name=name,
                   params=sub['params'],
                   fail=sub['fail'])

--- a/theano/gpuarray/nnet.py
+++ b/theano/gpuarray/nnet.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
 import os
 import numpy as np
-from theano import Op, Apply, config
+from theano import Op, Apply
 from six import StringIO
 try:
@@ -187,12 +187,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
                %(fail)s;
            }
        """ % locals()
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            %(err_check)s
-            """ % locals()
        sio = StringIO()
        print("""
        if (PyGpuArray_DIMS(%(x)s)[0] !=
@@ -235,13 +229,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
                %(am)s->ga.data, %(am)s->ga.offset,
                PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
            %(err_check)s
-            %(sync)s
        }
        """ % locals(), file=sio)
        return sio.getvalue()
    def c_code_cache_version(self):
-        return (13,)
+        return (14,)
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
@@ -267,7 +260,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        return Apply(self, [dnll, sm, y_idx], [sm.type()])
    def c_code_cache_version(self):
-        return (13,)
+        return (14,)
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']
@@ -296,12 +289,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
                %(fail)s;
            }
        """ % locals()
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            %(err_check)s
-            """ % locals()
        return """
        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
        const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
@@ -378,7 +365,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
                (void *)&stride_DX0, (void *)&stride_DX1};
            int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
            %(err_check)s
-            %(sync)s
        }
        assert(%(dx)s);
        """ % locals()
@@ -465,7 +451,7 @@ class GpuSoftmax(GpuKernelBase, Op):
        return shape
    def c_code_cache_version(self):
-        return (16,)
+        return (17,)
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']
@@ -487,15 +473,6 @@ class GpuSoftmax(GpuKernelBase, Op):
                %(fail)s;
            }
        """ % locals()
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            msg = "sync error";
-            %(err_check)s
-            """ % locals()
-        else:
-            sync = ""
        return """
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
@@ -555,7 +532,6 @@ class GpuSoftmax(GpuKernelBase, Op):
                msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
              }
              %(err_check)s
-              %(sync)s
            }
        }
        assert(%(z)s);
@@ -751,7 +727,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        return [shape[0]]
    def c_code_cache_version(self):
-        return (15,)
+        return (16,)
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']
@@ -775,13 +751,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
                %(fail)s;
            }
        """ % locals()
-        sync = ""
-        if config.gpuarray.sync:
-            sync = """
-            err = GpuArray_sync(&%(z)s->ga);
-            msg = "sync error";
-            %(err_check)s
-            """ % locals()
        return """
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
@@ -856,7 +825,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
                msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
              }
              %(err_check)s
-              %(sync)s
            }
        }
        assert(%(z)s);

--- a/theano/gpuarray/reduction.py
+++ b/theano/gpuarray/reduction.py
 from __future__ import print_function, absolute_import, division
 import os
-import theano
 from theano.gof import Op, Apply
 from theano.gof.type import Generic
@@ -125,11 +124,6 @@ class GpuMaxAndArgmax(Op):
            %(fail)s
        }
        """
-        if theano.config.gpuarray.sync:
-            ret += """
-            GpuArray_sync(&%(max)s->ga);
-            GpuArray_sync(&%(argmax)s->ga);
-            """
        return ret % {'X': input_names[0], 'axes': sub['params'], 'max': output_names[0], 'argmax': output_names[1],
                      'max_typecode': max_typecode, 'argmax_typecode': argmax_typecode,
                      'name': name, 'fail': sub['fail']}
@@ -141,4 +135,4 @@ class GpuMaxAndArgmax(Op):
        """ % {'name': name, 'X': inputs[0]}
    def c_code_cache_version(self):
-        return (1, 1)
+        return (2,)