提交 ff87ad08 作者: Arnaud Bergeron

Fix reduction in a limited case.

上级 5a1e170b
...@@ -1885,7 +1885,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1885,7 +1885,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [12] # the version corresponding to the c code in this Op version = [13] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -1906,6 +1906,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1906,6 +1906,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
load_in = load_w(node.inputs[0].dtype) load_in = load_w(node.inputs[0].dtype)
write_out = write_w(node.outputs[0].dtype)
if all(i == 1 for i in self.reduce_mask): if all(i == 1 for i in self.reduce_mask):
# this kernel is ok for up to a few thousand elements, but # this kernel is ok for up to a few thousand elements, but
...@@ -2159,7 +2160,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2159,7 +2160,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
%(reduce_fct)s; %(reduce_fct)s;
} }
Z[a * sZ0 + c * sZ1] = myresult; Z[a * sZ0 + c * sZ1] = %(write_out)s(myresult);
} }
} }
} }
...@@ -2240,7 +2241,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2240,7 +2241,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
if (warpSize != 32) if (warpSize != 32)
{ {
//TODO: set error code //TODO: set error code
Z[blockIdx.x * sZ0] = -666; Z[blockIdx.x * sZ0] = %(write_out)s(-666);
return; return;
} }
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论