提交 a43ea7a1 作者: Ian Goodfellow

gave c_support_code_apply case-specific op guards

上级 ec8100d0
...@@ -1486,10 +1486,10 @@ class GpuCAReduce(GpuOp): ...@@ -1486,10 +1486,10 @@ class GpuCAReduce(GpuOp):
raise NotImplementedError() raise NotImplementedError()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
self._op_guard()
sio = StringIO.StringIO() sio = StringIO.StringIO()
nd_in = len(self.reduce_mask) nd_in = len(self.reduce_mask)
if all(i == 1 for i in self.reduce_mask): if all(i == 1 for i in self.reduce_mask):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
...@@ -1517,6 +1517,7 @@ class GpuCAReduce(GpuOp): ...@@ -1517,6 +1517,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (1,): if self.reduce_mask == (1,):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
...@@ -1545,6 +1546,7 @@ class GpuCAReduce(GpuOp): ...@@ -1545,6 +1546,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (1, 1): if self.reduce_mask == (1, 1):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
...@@ -1580,6 +1582,7 @@ class GpuCAReduce(GpuOp): ...@@ -1580,6 +1582,7 @@ class GpuCAReduce(GpuOp):
if (0 == self.reduce_mask[0] and if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]): nd_in in[2, 3, 4]):
self._op_guard()
# this kernel uses one block for each row. # this kernel uses one block for each row.
# threads per block for each element per row. # threads per block for each element per row.
...@@ -1622,6 +1625,7 @@ class GpuCAReduce(GpuOp): ...@@ -1622,6 +1625,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
self._op_guard()
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
...@@ -1664,6 +1668,7 @@ class GpuCAReduce(GpuOp): ...@@ -1664,6 +1668,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 1, 0): if self.reduce_mask == (0, 1, 0):
self._op_guard()
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_010_AD_%(nodename)s( static __global__ void kernel_reduce_010_AD_%(nodename)s(
const int A, const int A,
...@@ -1704,6 +1709,7 @@ class GpuCAReduce(GpuOp): ...@@ -1704,6 +1709,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 1, 0): if self.reduce_mask == (0, 1, 0):
self._op_guard()
# #
# This kernel is optimized when the inner most dimensions # This kernel is optimized when the inner most dimensions
# have the smallest stride. # have the smallest stride.
...@@ -1745,6 +1751,7 @@ class GpuCAReduce(GpuOp): ...@@ -1745,6 +1751,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (1, 1, 0): if self.reduce_mask == (1, 1, 0):
self._op_guard()
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
...@@ -1786,6 +1793,7 @@ class GpuCAReduce(GpuOp): ...@@ -1786,6 +1793,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (1, 0, 0): if self.reduce_mask == (1, 0, 0):
self._op_guard()
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]') reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]')
decl = self._k_decl(node, nodename) decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
...@@ -1808,6 +1816,7 @@ class GpuCAReduce(GpuOp): ...@@ -1808,6 +1816,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (1, 1, 1): if self.reduce_mask == (1, 1, 1):
self._op_guard()
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
decl = self._k_decl(node, nodename) decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
...@@ -1830,6 +1839,7 @@ class GpuCAReduce(GpuOp): ...@@ -1830,6 +1839,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 0, 1): if self.reduce_mask == (0, 0, 1):
self._op_guard()
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
...@@ -1866,6 +1876,7 @@ class GpuCAReduce(GpuOp): ...@@ -1866,6 +1876,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 0, 1, 1): if self.reduce_mask == (0, 0, 1, 1):
self._op_guard()
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
...@@ -1894,6 +1905,7 @@ class GpuCAReduce(GpuOp): ...@@ -1894,6 +1905,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 1, 0, 1): if self.reduce_mask == (0, 1, 0, 1):
self._op_guard()
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]')
...@@ -1922,6 +1934,7 @@ class GpuCAReduce(GpuOp): ...@@ -1922,6 +1934,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (1, 1, 1, 1): if self.reduce_mask == (1, 1, 1, 1):
self._op_guard()
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
decl = self._k_decl(node, nodename) decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
...@@ -1945,6 +1958,7 @@ class GpuCAReduce(GpuOp): ...@@ -1945,6 +1958,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (1, 0, 1, 1): if self.reduce_mask == (1, 0, 1, 1):
self._op_guard()
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]') reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_1011_%(nodename)s( static __global__ void kernel_reduce_1011_%(nodename)s(
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论