提交 a012c63c authored 作者: James Bergstra's avatar James Bergstra

fixed potentially nasty out-of-bounds writes in GpuDownsampleFactorMaxGrad

上级 92cdbe39
...@@ -363,10 +363,8 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -363,10 +363,8 @@ class GpuDownsampleFactorMaxGrad(Op):
def make_node(self, x, z, gz): def make_node(self, x, z, gz):
return Apply(self, [x, z, gz], [x.type()]) return Apply(self, [x, z, gz], [x.type()])
#def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
def c_code_cache_version(self): def c_code_cache_version(self):
return () return (1,)
def c_code(self, node, nodename, (x, z, gz), (gx,), sub): def c_code(self, node, nodename, (x, z, gz), (gx,), sub):
fail = sub['fail'] fail = sub['fail']
ds0, ds1 = self.ds ds0, ds1 = self.ds
...@@ -396,9 +394,8 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -396,9 +394,8 @@ class GpuDownsampleFactorMaxGrad(Op):
} }
} }
{ {
//TODO: implement this by supporting more //TODO: supporting more output columns than threads
//outputs than threads dim3 grid(CudaNdarray_HOST_DIMS(%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[2]);
dim3 grid(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]);
dim3 block(CudaNdarray_HOST_DIMS(%(x)s)[3]); dim3 block(CudaNdarray_HOST_DIMS(%(x)s)[3]);
kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>( kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
CudaNdarray_HOST_DIMS(%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[0],
...@@ -441,9 +438,11 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -441,9 +438,11 @@ class GpuDownsampleFactorMaxGrad(Op):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
ignore_border = int(self.ignore_border) # This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
return """ return """
template<int ds0, int ds1> template<int ds0, int ds1> // ds0 is the downsampling factor in rows, ds1 in columns
__global__ void kDownsampleMaxGrad_%(nodename)s( __global__ void kDownsampleMaxGrad_%(nodename)s(
int D0, int D1, int D2, int D3, int xD2, int xD3, int D0, int D1, int D2, int D3, int xD2, int xD3,
const float * x, int xS0, int xS1, int xS2, int xS3, const float * x, int xS0, int xS1, int xS2, int xS3,
...@@ -457,14 +456,21 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -457,14 +456,21 @@ class GpuDownsampleFactorMaxGrad(Op):
int i2 = blockIdx.y; // row wrt z and/or gz int i2 = blockIdx.y; // row wrt z and/or gz
int x_col = threadIdx.x; int x_col = threadIdx.x;
//TODO: raise occupancy. Use threadIdx.y to run several iterations of this i1 loop //TODO: raise occupancy. Use threadIdx.y to run several iterations of this i1 loop
//in parallel //in parallel
for (i1 = 0; i1 < D1; ++i1)
for (i1 = 0; i1 < D1; ++i1) // loop over images (same for z and x)
{ {
// The algorithm here is that every thread writes one output pixel per line if (x_col >= ds1 * D3)
if (%(ignore_border)s && (x_col >= ds1 * D3))
{ {
my_gz = 0; // This happens only if x_col was ignored (via ignore_border)
// TODO: if ignore_border is False, this is impossible and we don't even
// need to generate this code.
my_gz = 0.0f;
//any fp number suffices for my_z, so we don't even need to set it to
//anything in particular.
} }
else else
{ {
...@@ -472,10 +478,10 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -472,10 +478,10 @@ class GpuDownsampleFactorMaxGrad(Op):
my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 + (x_col/ds1)* zS3]; my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 + (x_col/ds1)* zS3];
} }
for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (%(ignore_border)s || (x_row < xD2)); ++x_row) for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (x_row < xD2); ++x_row)
{ {
gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col] gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col]
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0; = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0.0f;
} }
} }
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论