Commit 23535836 authored by James Bergstra

many fixes to GpuDownsample ops

Parent ed85af6c
@@ -187,13 +187,24 @@ class GpuConv(Op):
                 kern_align=self.logical_kern_align_top,
                 verbose=0)

-from theano.sandbox.downsample import DownsampleFactorMax
-class GpuDownsampleFactorMax(DownsampleFactorMax):
-    # inherit __eq__, __hash__, __str__
+class GpuDownsampleFactorMax(Op):
+    def __init__(self, ds, ignore_border=False):
+        self.ds = tuple(ds)
+        self.ignore_border = ignore_border
+    def __eq__(self, other):
+        return type(self) == type(other) and self.ds == other.ds and self.ignore_border == other.ignore_border
+    def __hash__(self):
+        return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)
+    def __str__(self):
+        return '%s{%s,%s}' % (self.__class__.__name__, self.ds, self.ignore_border)
     def make_node(self, x):
         return Apply(self, [x], [x.type()])
-    def perform(self, node, input_storage, output_storage):
-        raise NotImplementedError('only C is implemented')
+    #def perform(self, node, input_storage, output_storage):
+        #raise NotImplementedError('only C is implemented')
     def c_code_cache_version(self):
         return ()
     def c_code(self, node, nodename, (x,), (z,), sub):
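The rewrite above drops the subclassing of the CPU op DownsampleFactorMax and gives the GPU op its own __eq__, __hash__, and __str__. Two instances built with the same ds and ignore_border now compare equal, which is what Theano's merge optimizer relies on to deduplicate identical graph nodes. A minimal standalone sketch of the same identity pattern (PoolKey is a hypothetical stand-in, not Theano code):

    # Standalone sketch of the identity pattern used in the new op.
    class PoolKey(object):
        def __init__(self, ds, ignore_border=False):
            self.ds = tuple(ds)
            self.ignore_border = ignore_border
        def __eq__(self, other):
            return (type(self) == type(other) and self.ds == other.ds
                    and self.ignore_border == other.ignore_border)
        def __hash__(self):
            return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)

    # Equal parameters -> equal ops, so the graph optimizer can merge nodes.
    assert PoolKey((2, 2)) == PoolKey((2, 2))
    assert hash(PoolKey((2, 2))) == hash(PoolKey((2, 2)))
    assert PoolKey((2, 2)) != PoolKey((3, 3))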
@@ -240,7 +251,8 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
             //dim3 block(std::min(dims[3], 512)); //TODO: implement this by supporting more
             //outputs than threads
             dim3 block(dims[3]);
-            kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3>>>(
+            if ((grid.x*grid.y) && dims[3])
+            kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3*sizeof(float)>>>(
                 dims[0], dims[1], dims[2], dims[3], xdim2, xdim3,
                 CudaNdarray_DEV_DATA(cnda_%(x)s),
                 CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
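Two things change in this launch: the kernel is skipped when the grid or block would be empty (CUDA rejects zero-sized launches), and the third launch parameter, which CUDA reads as a byte count of dynamic shared memory, becomes xdim3*sizeof(float) rather than xdim3; the old value under-allocated the shared xbuf row by a factor of four. A quick sanity check of the geometry implied by the kernel's indexing (the shape and ds here are illustrative, not from the commit; the grid construction itself is outside this hunk):

    import numpy as np
    # Hypothetical example: x has shape (8, 4, 32, 32) and ds = (2, 2),
    # so the output z has dims = (8, 4, 16, 16).
    dims = (8, 4, 16, 16)
    xdim2, xdim3 = 32, 32
    grid = (dims[0] * dims[1], dims[2])  # one block per (image, channel, out row)
    block = dims[3]                      # one thread per output column
    shared_bytes = xdim3 * np.dtype('float32').itemsize
    assert shared_bytes == 128           # the old launch passed only 32 bytes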
@@ -252,7 +264,14 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
             cudaError_t err = cudaGetLastError();
             if( cudaSuccess != err)
             {
-                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kMaxPool_%(nodename)s", cudaGetErrorString(err));
+                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
+                    "kMaxPool_%(nodename)s",
+                    cudaGetErrorString(err),
+                    grid.x,
+                    grid.y,
+                    block.x,
+                    block.y,
+                    block.z);
                 %(fail)s;
             }
         }
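Note the doubled percent signs: this C code lives inside a Python %-template, so %(nodename)s is substituted by Python while %%s and %%i survive as %s and %i for PyErr_Format at C level. A small demonstration of the two-stage substitution (the template line is abbreviated):

    # Two-stage formatting: Python fills %(nodename)s; C's PyErr_Format
    # later consumes the %s/%i that the doubled %% left behind.
    tpl = '"Cuda error: %%s: %%s. (grid: %%i x %%i)", "kMaxPool_%(nodename)s"'
    print(tpl % {'nodename': 'node_42'})
    # -> "Cuda error: %s: %s. (grid: %i x %i)", "kMaxPool_node_42"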
@@ -268,8 +287,8 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
         float *z)
     {
         float cur_max, cur_x;
-        int i0 = blockIdx.x / D0;
-        int i1 = blockIdx.x %% D0;
+        int i0 = blockIdx.x %% D0;
+        int i1 = blockIdx.x / D0;
         int i2 = blockIdx.y;
         extern __shared__ float xbuf[]; //size [xD3]
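The index swap matters whenever D0 != D1: the old decode put i0 = blockIdx.x / D0 in the range [0, D1) and i1 in [0, D0), so each index ran over the wrong dimension's extent. The corrected decode inverts a flat packing of the form idx = i1*D0 + i0 (the launch flattens the first two output dimensions into blockIdx.x; that code is outside this hunk). A quick round-trip check:

    # Decode check for the flat block index: idx = i1*D0 + i0.
    D0, D1 = 8, 3   # deliberately unequal, so a swapped decode would misindex
    for i0 in range(D0):
        for i1 in range(D1):
            idx = i1 * D0 + i0
            assert idx % D0 == i0    # fixed decode: i0 = blockIdx.x % D0
            assert idx // D0 == i1   # fixed decode: i1 = blockIdx.x / D0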
@@ -278,9 +297,9 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
         {
             __syncthreads();
             // load the current row of the image into shared memory
-            for (int i3 = threadIdx.x; i3 < xD3; i3 += blockDim.x)
+            for (int j = threadIdx.x; j < xD3; j += blockDim.x)
             {
-                xbuf[i3] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + i3*xS3];
+                xbuf[j] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + j*xS3];
             }
             __syncthreads();
@@ -288,10 +307,24 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
             cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max;
             // do a mini-reduction over the pf3 relevant elements in the current row
-            for (int k = 0; k < pf3; ++k)
-            {
-                cur_x = xbuf[threadIdx.x*pf3+k];
-                cur_max = (cur_x < cur_max) ? cur_x : cur_max;
-            }
+            if (%(ignore_border)s)
+            {
+                for (int k = 0; k < pf3; ++k)
+                {
+                    cur_x = xbuf[threadIdx.x*pf3+k];
+                    cur_max = (cur_x > cur_max) ? cur_x : cur_max;
+                }
+            }
+            else
+            {
+                for (int k = 0; k < pf3; ++k)
+                {
+                    if (threadIdx.x*pf3 + k < xD3)
+                    {
+                        cur_x = xbuf[threadIdx.x*pf3+k];
+                        cur_max = (cur_x > cur_max) ? cur_x : cur_max;
+                    }
+                }
+            }
         }
     }
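Two bugs are fixed in the reduction itself: the old comparison was cur_x < cur_max, which computed a running minimum instead of a maximum, and the ignore_border=False path now bounds-checks threadIdx.x*pf3 + k < xD3 so partial windows at the right edge read only valid columns. For reference, a NumPy sketch of the pooling semantics this kernel is assumed to implement (non-overlapping ds0 x ds1 max windows over the last two axes, keeping partial border windows unless ignore_border is set):

    import numpy as np

    def downsample_max_ref(x, ds, ignore_border=False):
        # Reference (assumed) semantics of DownsampleFactorMax on a 4D array.
        ds0, ds1 = ds
        n0, n1, n2, n3 = x.shape
        if ignore_border:
            o2, o3 = n2 // ds0, n3 // ds1
        else:
            o2, o3 = -(-n2 // ds0), -(-n3 // ds1)  # ceil division keeps partial windows
        z = np.empty((n0, n1, o2, o3), dtype=x.dtype)
        for r in range(o2):
            for c in range(o3):
                z[:, :, r, c] = x[:, :, r*ds0:(r+1)*ds0, c*ds1:(c+1)*ds1].max(axis=(2, 3))
        return z

    x = np.random.randn(2, 3, 5, 5).astype('float32')
    assert downsample_max_ref(x, (2, 2), ignore_border=True).shape == (2, 3, 2, 2)
    assert downsample_max_ref(x, (2, 2), ignore_border=False).shape == (2, 3, 3, 3)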
@@ -300,13 +333,24 @@ class GpuDownsampleFactorMax(DownsampleFactorMax):
     }
     """ % locals()

-from theano.sandbox.downsample import DownsampleFactorMaxGrad
-class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
-    # inherit __eq__, __hash__, __str__
+class GpuDownsampleFactorMaxGrad(Op):
+    def __init__(self, ds, ignore_border):
+        self.ds = tuple(ds)
+        self.ignore_border = ignore_border
+    def __eq__(self, other):
+        return type(self) == type(other) and self.ds == other.ds and self.ignore_border == other.ignore_border
+    def __hash__(self):
+        return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)
+    def __str__(self):
+        return '%s{%s,%s}' % (self.__class__.__name__, self.ds, self.ignore_border)
     def make_node(self, x, z, gz):
         return Apply(self, [x, z, gz], [x.type()])
-    def perform(self, node, input_storage, output_storage):
-        raise NotImplementedError('only C is implemented')
+    #def perform(self, node, input_storage, output_storage):
+        #raise NotImplementedError('only C is implemented')
     def c_code_cache_version(self):
         return ()
     def c_code(self, node, nodename, (x, z, gz), (gx,), sub):
@@ -338,9 +382,9 @@ class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
             }
         }
         {
+            dim3 grid(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]);
             //TODO: implement this by supporting more
             //outputs than threads
-            dim3 grid(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]);
             dim3 block(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]);
             kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
                 CudaNdarray_HOST_DIMS(cnda_%(z)s)[0],
@@ -399,9 +443,11 @@ class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
         int i2 = blockIdx.y; // row wrt z and/or gz
         int x_col = threadIdx.x;

-        // The algorithm here is that every thread writes one output pixel per line
+        //TODO: raise occupancy. Use threadIdx.y to run several iterations of this i1 loop
+        //in parallel
         for (i1 = 0; i1 < D1; ++i1)
         {
+            // The algorithm here is that every thread writes one output pixel per line
             if (%(ignore_border)s && (x_col >= ds1 * D3))
             {
                 my_gz = 0;
@@ -415,7 +461,7 @@ class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
             for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (%(ignore_border)s || (x_row < xD2)); ++x_row)
             {
                 gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col]
-                    = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col]) ? my_gz : 0;
+                    = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0;
             }
         }
     }
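The change above is a stride bug fix: the old read x[... + x_col] implicitly assumed the last input dimension was contiguous (stride 1); multiplying by xS3 makes the max-position test correct for any input layout. For reference, a NumPy sketch of the gradient this kernel is assumed to compute: each input position receives its window's gz iff its value equals the pooled maximum z (ties all receive the gradient, matching the kernel's equality test):

    import numpy as np

    def downsample_max_grad_ref(x, z, gz, ds):
        # Reference (assumed) gradient: route gz to the argmax position(s)
        # of each non-overlapping ds0 x ds1 window.
        ds0, ds1 = ds
        gx = np.zeros_like(x)
        n0, n1, o2, o3 = z.shape
        for r in range(o2):
            for c in range(o3):
                win = x[:, :, r*ds0:(r+1)*ds0, c*ds1:(c+1)*ds1]
                mask = (win == z[:, :, r:r+1, c:c+1])
                gx[:, :, r*ds0:(r+1)*ds0, c*ds1:(c+1)*ds1] = mask * gz[:, :, r:r+1, c:c+1]
        return gx

    x = np.arange(16.0).reshape(1, 1, 4, 4)
    z = x[:, :, 1::2, 1::2]       # window maxima for this monotone input
    gz = np.ones_like(z)
    assert downsample_max_grad_ref(x, z, gz, (2, 2)).sum() == 4.0   # one winner per window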