GpuBatchedDot: remove CNDA_THREAD_SYNC from streams implementation as it always…

GpuBatchedDot: remove CNDA_THREAD_SYNC from the streams implementation, which already synchronizes every stream explicitly, and add it to the cublasSgemmBatched implementation instead

GpuBatchedDot: remove CNDA_THREAD_SYNC from streams implementation as it always…
4acaa2cf · Tim Cooijmans · 6cb8b04c · 4acaa2cf
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -177,9 +177,9 @@ class GpuBatchedDot(GpuOp):
                                     (const float **) gpu_y, y_dim2,
                                     (const float **) gpu_x, x_dim2, &beta,
                                     gpu_z, y_dim2, x_dim0);
+            CNDA_THREAD_SYNC;

            CLEANUP();
-
            if (CUBLAS_STATUS_SUCCESS != err)
            {
                PyErr_Format(PyExc_RuntimeError,
@@ -303,7 +303,6 @@ class GpuBatchedDot(GpuOp):

                x += Sx[0]; y += Sy[0]; z += Sz[0];
            };
-            CNDA_THREAD_SYNC;

            for(int i = 0; i < N_STREAMS; i++) {
                cudaStreamSynchronize(streams[i]);