提交 9411e656 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Fix GpuDownSample...Grad with non-contiguous output memory

上级 529e4c52
...@@ -933,7 +933,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -933,7 +933,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
return Apply(self, [x, z, gz], [x.type()]) return Apply(self, [x, z, gz], [x.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, z, gz = inp x, z, gz = inp
...@@ -1001,7 +1001,11 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1001,7 +1001,11 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
CudaNdarray_HOST_STRIDES(%(gz)s)[1], CudaNdarray_HOST_STRIDES(%(gz)s)[1],
CudaNdarray_HOST_STRIDES(%(gz)s)[2], CudaNdarray_HOST_STRIDES(%(gz)s)[2],
CudaNdarray_HOST_STRIDES(%(gz)s)[3], CudaNdarray_HOST_STRIDES(%(gz)s)[3],
CudaNdarray_DEV_DATA(%(gx)s)); CudaNdarray_DEV_DATA(%(gx)s),
CudaNdarray_HOST_STRIDES(%(gx)s)[0],
CudaNdarray_HOST_STRIDES(%(gx)s)[1],
CudaNdarray_HOST_STRIDES(%(gx)s)[2],
CudaNdarray_HOST_STRIDES(%(gx)s)[3]);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
...@@ -1039,7 +1043,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1039,7 +1043,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
const float * x, int xS0, int xS1, int xS2, int xS3, const float * x, int xS0, int xS1, int xS2, int xS3,
const float * z, int zS0, int zS1, int zS2, int zS3, const float * z, int zS0, int zS1, int zS2, int zS3,
const float * gz, int gzS0, int gzS1, int gzS2, int gzS3, const float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
float *gx) float *gx, int gxS0, int gxS1, int gxS2, int gxS3)
{ {
// D0: number of image rows // D0: number of image rows
// D1: number of image cols // D1: number of image cols
...@@ -1050,6 +1054,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1050,6 +1054,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// various .S. variables are strides // various .S. variables are strides
float cur_max, cur_x, my_z, my_gz; float cur_max, cur_x, my_z, my_gz;
// Cast threadIdx.x into a signed int, to avoid problems with
// indexing with negative offsets.
int tx = threadIdx.x;
for(int i0 = blockIdx.x; for(int i0 = blockIdx.x;
i0 < D0; i0 < D0;
i0 += gridDim.x){ i0 += gridDim.x){
...@@ -1058,7 +1066,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1058,7 +1066,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2 // row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2
// (as needed to cover all x rows) // (as needed to cover all x rows)
int i2 = blockIdx.y; int i2 = blockIdx.y;
int x_col = threadIdx.x; // col wrt x, ranges from 0 to xD3 - 1 int x_col = tx; // col wrt x, ranges from 0 to xD3 - 1
int z_col = x_col/ds1; // z_col corresponding to this x_col int z_col = x_col/ds1; // z_col corresponding to this x_col
...@@ -1075,7 +1083,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1075,7 +1083,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
if(blockDim.x != xD3) if(blockDim.x != xD3)
{ {
x_col = threadIdx.x + col_iter * blockDim.x; x_col = tx + col_iter * blockDim.x;
z_col = x_col/ds1; z_col = x_col/ds1;
} }
...@@ -1110,13 +1118,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1110,13 +1118,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// gx[image_row][image_col][x_row][x_col] // gx[image_row][image_col][x_row][x_col]
// = (my_z == x[image_row][image_col][ // = (my_z == x[image_row][image_col][
// x_row][x_col]) ? my_gz : 0.0f; // x_row][x_col]) ? my_gz : 0.0f;
gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + gx[i0*gxS0 + i1*gxS1 + x_row*gxS2 + x_col*gxS3]
x_row*xD3 + x_col]
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 +
x_col*xS3]) ? my_gz : 0.0f; x_col*xS3]) ? my_gz : 0.0f;
} }
//gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 +
// x_row*xD3 + x_col] = -999;
} }
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论