Update following code review

5922a930 · Frederic · Arnaud Bergeron · 5f3b4fab · 5922a930 · 5922a930
--- a/theano/misc/elemwise_openmp_speedup.py
+++ b/theano/misc/elemwise_openmp_speedup.py
@@ -49,7 +49,7 @@ if __name__ == '__main__':
    else:
        costlySpeed = costlyTimeOpenmp / costlyTime
        costlySpeedstring = "slowdown"
-    print("Timmed with vector of %d elements" % options.N)
+    print("Timed with vector of %d elements" % options.N)
    print("Fast op time without openmp %fs with openmp %fs %s %2.2f" % (
        cheapTime, cheapTimeOpenmp, cheapSpeedstring, cheapSpeed))

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1116,8 +1116,13 @@ def local_gpu_incsubtensor(node):
            incsubt = host_output.owner.op
            x, y = host_output.owner.inputs[0:2]
            coords = host_output.owner.inputs[2:]
-            if x.dtype != "float32" or y.dtype != "float32":
+            if x.dtype != "float32":
                return
+            if y.dtype != "float32":
+                # IncSubtensor upcasts y to float32, so we do that cast
+                # explicitly here in order to move y to the GPU.
+                y = y.astype('float32')
            return [GpuIncSubtensor(
                incsubt.idx_list,
                inplace=incsubt.inplace,