提交 23535836 authored 作者: James Bergstra's avatar James Bergstra

many fixes to GpuDownsample ops

上级 ed85af6c
......@@ -187,13 +187,24 @@ class GpuConv(Op):
kern_align=self.logical_kern_align_top,
verbose=0)
from theano.sandbox.downsample import DownsampleFactorMax
class GpuDownsampleFactorMax(DownsampleFactorMax):
# inherit __eq__, __hash__, __str__
class GpuDownsampleFactorMax(Op):
def __init__(self, ds, ignore_border=False):
self.ds = tuple(ds)
self.ignore_border = ignore_border
def __eq__(self, other):
return type(self) == type(other) and self.ds == other.ds and self.ignore_border == other.ignore_border
def __hash__(self):
return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)
def __str__(self):
return '%s{%s,%s}' % (self.__class__.__name__, self.ds, self.ignore_border)
def make_node(self, x):
return Apply(self, [x], [x.type()])
def perform(self, node, input_storage, output_storage):
raise NotImplementedError('only C is implemented')
#def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, (x,), (z,), sub):
......@@ -240,7 +251,8 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
//dim3 block(std::min(dims[3], 512)); //TODO: implement this by supporting more
//outputs than threads
dim3 block(dims[3]);
kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3>>>(
if ((grid.x*grid.y) && dims[3])
kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3*sizeof(float)>>>(
dims[0], dims[1], dims[2], dims[3], xdim2, xdim3,
CudaNdarray_DEV_DATA(cnda_%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
......@@ -252,7 +264,14 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kMaxPool_%(nodename)s", cudaGetErrorString(err));
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kMaxPool_%(nodename)s",
cudaGetErrorString(err),
grid.x,
grid.y,
block.x,
block.y,
block.z);
%(fail)s;
}
}
......@@ -268,8 +287,8 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
float *z)
{
float cur_max, cur_x;
int i0 = blockIdx.x / D0;
int i1 = blockIdx.x %% D0;
int i0 = blockIdx.x %% D0;
int i1 = blockIdx.x / D0;
int i2 = blockIdx.y;
extern __shared__ float xbuf[]; //size [xD3]
......@@ -278,9 +297,9 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
{
__syncthreads();
// load the current row of the image into shared memory
for (int i3 = threadIdx.x; i3 < xD3; i3 += blockDim.x)
for (int j = threadIdx.x; j < xD3; j += blockDim.x)
{
xbuf[i3] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + i3*xS3];
xbuf[j] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + j*xS3];
}
__syncthreads();
......@@ -288,10 +307,24 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max;
// do a mini-reduction over the pf3 relevant elements in the current row
for (int k = 0; k < pf3; ++k)
if (%(ignore_border)s)
{
for (int k = 0; k < pf3; ++k)
{
cur_x = xbuf[threadIdx.x*pf3+k];
cur_max = (cur_x > cur_max) ? cur_x : cur_max;
}
}
else
{
cur_x = xbuf[threadIdx.x*pf3+k];
cur_max = (cur_x < cur_max) ? cur_x : cur_max;
for (int k = 0; k < pf3; ++k)
{
if (threadIdx.x*pf3 + k < xD3)
{
cur_x = xbuf[threadIdx.x*pf3+k];
cur_max = (cur_x > cur_max) ? cur_x : cur_max;
}
}
}
}
......@@ -300,13 +333,24 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
}
""" % locals()
from theano.sandbox.downsample import DownsampleFactorMaxGrad
class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
# inherit __eq__, __hash__, __str__
class GpuDownsampleFactorMaxGrad(Op):
def __init__(self, ds, ignore_border):
self.ds = tuple(ds)
self.ignore_border = ignore_border
def __eq__(self, other):
return type(self) == type(other) and self.ds == other.ds and self.ignore_border == other.ignore_border
def __hash__(self):
return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)
def __str__(self):
return '%s{%s,%s}' % (self.__class__.__name__, self.ds, self.ignore_border)
def make_node(self, x, z, gz):
return Apply(self, [x, z, gz], [x.type()])
def perform(self, node, input_storage, output_storage):
raise NotImplementedError('only C is implemented')
#def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, (x, z, gz), (gx,), sub):
......@@ -338,9 +382,9 @@ class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
}
}
{
dim3 grid(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]);
//TODO: implement this by supporting more
//outputs than threads
dim3 grid(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]);
dim3 block(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]);
kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
CudaNdarray_HOST_DIMS(cnda_%(z)s)[0],
......@@ -399,9 +443,11 @@ class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
int i2 = blockIdx.y; // row wrt z and/or gz
int x_col = threadIdx.x;
// The algorithm here is that every thread writes one output pixel per line
//TODO: raise occupancy. Use threadIdx.y to run several iterations of this i1 loop
//in parallel
for (i1 = 0; i1 < D1; ++i1)
{
// The algorithm here is that every thread writes one output pixel per line
if (%(ignore_border)s && (x_col >= ds1 * D3))
{
my_gz = 0;
......@@ -415,7 +461,7 @@ class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (%(ignore_border)s || (x_row < xD2)); ++x_row)
{
gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col]
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col]) ? my_gz : 0;
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0;
}
}
}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论