提交 2bf5a874 authored 作者: Frederic Bastien's avatar Frederic Bastien

BUGFIX in GpuDownsampleFactorMaxGrad from Josh Bleecher Snyder.

上级 58cfa9ea
...@@ -570,7 +570,9 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -570,7 +570,9 @@ class GpuDownsampleFactorMaxGrad(Op):
def make_node(self, x, z, gz): def make_node(self, x, z, gz):
return Apply(self, [x, z, gz], [x.type()]) return Apply(self, [x, z, gz], [x.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) #return ()
return (2,)
def c_code(self, node, nodename, (x, z, gz), (gx,), sub): def c_code(self, node, nodename, (x, z, gz), (gx,), sub):
fail = sub['fail'] fail = sub['fail']
ds0, ds1 = self.ds ds0, ds1 = self.ds
...@@ -601,7 +603,9 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -601,7 +603,9 @@ class GpuDownsampleFactorMaxGrad(Op):
} }
{ {
//TODO: supporting more output columns than threads //TODO: supporting more output columns than threads
dim3 grid(CudaNdarray_HOST_DIMS(%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[2]); // make sure we cover every x row when ignore border isset and there's a border present to be ignored
int needs_extra_z_col = %(ignore_border)s && (CudaNdarray_HOST_DIMS(%(x)s)[2] %% %(ds0)s);
dim3 grid(CudaNdarray_HOST_DIMS(%(z)s)[0],CudaNdarray_HOST_DIMS(%(z)s)[2] + (needs_extra_z_col ? 1 : 0));
dim3 block(CudaNdarray_HOST_DIMS(%(x)s)[3]); dim3 block(CudaNdarray_HOST_DIMS(%(x)s)[3]);
kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>( kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
CudaNdarray_HOST_DIMS(%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[0],
...@@ -644,9 +648,14 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -644,9 +648,14 @@ class GpuDownsampleFactorMaxGrad(Op):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code considers every position in the output z, andthen computes the gradient for the
# It runs for every position in the output z, and then computes the gradient for the # input pixels that were downsampled to that z-position. It does so by running along every
# input pixels that were downsampled to that z-position. # z row (sometimes plus one, to make sure every gx row gets totally filled), and by
# running along every x col. This code is not sensitive to the ignore_border flag along
# the row dimension (since it runs for every position in the output z), but it is sensitive
# along the col dimension.
ignore_border = int(self.ignore_border)
return """ return """
template<int ds0, int ds1> // ds0 is the downsampling factor in rows, ds1 in columns template<int ds0, int ds1> // ds0 is the downsampling factor in rows, ds1 in columns
__global__ void kDownsampleMaxGrad_%(nodename)s( __global__ void kDownsampleMaxGrad_%(nodename)s(
...@@ -656,11 +665,20 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -656,11 +665,20 @@ class GpuDownsampleFactorMaxGrad(Op):
const float * gz, int gzS0, int gzS1, int gzS2, int gzS3, const float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
float *gx) float *gx)
{ {
// D0: number of image rows
// D1: number of image cols
// D2: number of z rows
// D3: number of z cols
// xD2: number of x rows
// xD3: number of x cols
// various .S. variables are strides
float cur_max, cur_x, my_z, my_gz; float cur_max, cur_x, my_z, my_gz;
int i0 = blockIdx.x; int i0 = blockIdx.x; // image row
int i1 = 0; int i1 = 0; // image col
int i2 = blockIdx.y; // row wrt z and/or gz int i2 = blockIdx.y; // row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2 (as needed to cover all x rows)
int x_col = threadIdx.x; int x_col = threadIdx.x; // col wrt x, ranges from 0 to xD3 - 1
int z_col = x_col/ds1; // z_col corresponding to this x_col
//TODO: raise occupancy. Use threadIdx.y to run several iterations of this i1 loop //TODO: raise occupancy. Use threadIdx.y to run several iterations of this i1 loop
...@@ -680,12 +698,18 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -680,12 +698,18 @@ class GpuDownsampleFactorMaxGrad(Op):
} }
else else
{ {
my_gz = gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + (x_col/ds1)*gzS3]; // this is effectively:
my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 + (x_col/ds1)* zS3]; // my_gz = gz[image_row][image_col][z_row][z_col]
// my_z = z[image_row][image_col][z_row][z_col]
my_gz = gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + z_col*gzS3];
my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 + z_col* zS3];
} }
for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (x_row < xD2); ++x_row) for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (x_row < xD2); ++x_row)
{ {
// this is effectively:
// gx[image_row][image_col][x_row][x_col]
// = (my_z ==
gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col] gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col]
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0.0f; = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0.0f;
} }
......
...@@ -144,12 +144,12 @@ def test_downsample(): ...@@ -144,12 +144,12 @@ def test_downsample():
for shp in [ for shp in [
(1, 1, 1, 12), (1, 1, 1, 12),
(1, 1, 2, 2), (1, 1, 2, 2),
#(1, 1, 1, 1), #### Commented out because it makes FP-exception that I don't understand (1, 1, 1, 1),
(1,1,4,4), (1,1,4,4),
(1, 1, 10, 11), (1, 1, 10, 11),
(1, 2, 2, 2), (1, 2, 2, 2),
(3,5,4,4), (3,5,4,4),
(25, 1, 7, 7),#test odd number this already found a bug. (25, 1, 7, 7),
(1, 1, 12, 12), (1, 1, 12, 12),
(1, 1, 2, 14), (1, 1, 2, 14),
(1, 1, 12, 14), (1, 1, 12, 14),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论