提交 cee77d13 authored 作者: James Bergstra's avatar James Bergstra

added maxpooling op GpuDownsampleFactorMax

上级 3272fee0
......@@ -187,3 +187,238 @@ class GpuConv(Op):
kern_align=self.logical_kern_align_top,
verbose=0)
from theano.sandbox.downsample import DownsampleFactorMax
class GpuDownsampleFactorMax(DownsampleFactorMax):
    """GPU max-pooling of a 4d CudaNdarray by non-overlapping factors
    (ds0, ds1) over the last two dimensions.

    __eq__, __hash__ and __str__ are inherited from DownsampleFactorMax so
    graph merging treats the CPU and GPU ops consistently.
    """
    def make_node(self, x):
        # x is expected to be a 4d CudaNdarray variable (rank is checked at
        # runtime in the generated C code); output has the same type.
        return Apply(self, [x], [x.type()])

    def perform(self, node, input_storage, output_storage):
        raise NotImplementedError('only C is implemented')

    def c_code_cache_version(self):
        # Empty tuple: disable caching of the compiled code while the
        # kernel is still under development.
        return ()

    def c_code(self, node, nodename, inp, out, sub):
        # Explicit unpacking instead of tuple parameters in the signature:
        # tuple parameters are Python-2-only syntax; this form is valid in
        # both Python 2 and 3 and is call-compatible for Theano.
        x, = inp
        z, = out
        fail = sub['fail']
        ds0, ds1 = self.ds
        ignore_border = int(self.ignore_border)
        return """
        int dims[4], xdim2, xdim3;
        if (cnda_%(x)s->nd != 4)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        xdim2 = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2];
        xdim3 = CudaNdarray_HOST_DIMS(cnda_%(x)s)[3];
        dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
        dims[1] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[1];
        dims[2] = xdim2 / %(ds0)s;
        dims[3] = xdim3 / %(ds1)s;
        if (! %(ignore_border)s)
        {
            // partial pooling regions at the border also produce an output
            dims[2] += (xdim2%%(%(ds0)s)?1:0);
            dims[3] += (xdim3%%(%(ds1)s)?1:0);
        }
        // (re)allocate the output only if it is missing or mis-shaped
        if ((NULL == cnda_%(z)s)
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != dims[0])
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != dims[1])
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[2] != dims[2])
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[3] != dims[3]))
        {
            Py_XDECREF(cnda_%(z)s);
            cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
            if ((NULL == cnda_%(z)s)
                || CudaNdarray_alloc_contiguous(cnda_%(z)s, 4, dims))
            {
                Py_XDECREF(cnda_%(z)s);
                cnda_%(z)s = NULL;
                %(fail)s;
            }
        }
        {
            // one block per (batch, channel, out-row); one thread per out-col
            dim3 grid(dims[0] * dims[1], dims[2]);
            //dim3 block(std::min(dims[3], 512)); //TODO: implement this by supporting more
            //outputs than threads
            dim3 block(dims[3]);
            // Dynamic shared-memory size is in BYTES: one input row of xdim3
            // floats.  (Bugfix: the launch used to pass only xdim3 bytes.)
            kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3 * sizeof(float)>>>(
                dims[0], dims[1], dims[2], dims[3], xdim2, xdim3,
                CudaNdarray_DEV_DATA(cnda_%(x)s),
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(z)s));
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kMaxPool_%(nodename)s", cudaGetErrorString(err));
                %(fail)s;
            }
        }
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        ignore_border = int(self.ignore_border)
        return """
        template<int pf2, int pf3>
        __global__ void kMaxPool_%(nodename)s(
           int D0, int D1, int D2, int D3, int xD2, int xD3,
           const float * x, int xS0, int xS1, int xS2, int xS3,
           float *z)
        {
            float cur_max, cur_x;
            // blockIdx.x enumerates (batch, channel) pairs and grid.x is
            // D0*D1, so decode with D1.  (Bugfix: the original divided and
            // took the modulus by D0, which produces wrong / out-of-range
            // indices whenever D0 != D1.)
            int i0 = blockIdx.x / D1;  // batch index, in [0, D0)
            int i1 = blockIdx.x %% D1; // channel index, in [0, D1)
            int i2 = blockIdx.y;       // output row
            extern __shared__ float xbuf[]; //size [xD3]
            for (int r2 = 0; (r2 < pf2) && (%(ignore_border)s || (r2 + i2*pf2 < xD2)); ++r2)
            {
                __syncthreads();
                // load the current row of the image into shared memory
                for (int i3 = threadIdx.x; i3 < xD3; i3 += blockDim.x)
                {
                    xbuf[i3] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + i3*xS3];
                }
                __syncthreads();
                // initialize our max if this is the first row we're loading
                cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max;
                // mini-reduction over the pf3 relevant elements of this row.
                // Bugfix: the comparison used to be '<', which computed a
                // minimum instead of the maximum.
                // NOTE(review): when ignore_border==0 the last thread's
                // window may read xbuf past column xD3 -- TODO confirm and
                // clamp (threadIdx.x*pf3+k < xD3).
                for (int k = 0; k < pf3; ++k)
                {
                    cur_x = xbuf[threadIdx.x*pf3+k];
                    cur_max = (cur_x > cur_max) ? cur_x : cur_max;
                }
            }
            //store the result to global memory
            z[i0 * D1*D2*D3 + i1*D2*D3 + i2*D3 + threadIdx.x] = cur_max;
        }
        """ % locals()
from theano.sandbox.downsample import DownsampleFactorMaxGrad
class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
    """GPU gradient of the (ds0, ds1) max-pooling op: routes each output
    gradient back to the input positions whose value equals the pooled max.
    """
    # inherit __eq__, __hash__, __str__
    def make_node(self, x, z, gz):
        # x: pooled input, z: forward pooled output, gz: gradient wrt z.
        # The gradient wrt x has x's type (shapes resolved at runtime).
        return Apply(self, [x, z, gz], [x.type()])
    def perform(self, node, input_storage, output_storage):
        raise NotImplementedError('only C is implemented')
    def c_code_cache_version(self):
        # Empty tuple: disable caching of the compiled code.
        return ()
    def c_code(self, node, nodename, (x, z, gz), (gx,), sub):
        fail = sub['fail']
        ds0, ds1 = self.ds
        ignore_border = int(self.ignore_border)
        # NOTE(review): the launch uses grid.y = x.shape[2], but the kernel
        # treats blockIdx.y as a row of z (it scans x rows i2*ds0 ..
        # i2*ds0+ds0-1).  When ds0 > 1 this launches more row-blocks than z
        # has rows, which looks like it can read z/gz and write gx out of
        # bounds -- TODO confirm against the kernel below.
        return """
        if (cnda_%(x)s->nd != 4
            || cnda_%(z)s->nd != 4
            || cnda_%(gz)s->nd != 4)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if ((NULL == cnda_%(gx)s)
            || (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1])
            || (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[2] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[2])
            || (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[3] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]))
        {
            Py_XDECREF(cnda_%(gx)s);
            cnda_%(gx)s = (CudaNdarray*)CudaNdarray_new_null();
            if ((NULL == cnda_%(gx)s)
                || CudaNdarray_alloc_contiguous(cnda_%(gx)s, 4, CudaNdarray_HOST_DIMS(cnda_%(x)s)))
            {
                Py_XDECREF(cnda_%(gx)s);
                cnda_%(gx)s = NULL;
                %(fail)s;
            }
        }
        {
            dim3 grid(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]);
            //TODO: implement this by supporting more
            //outputs than threads
            dim3 block(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]);
            kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
                CudaNdarray_HOST_DIMS(cnda_%(z)s)[0],
                CudaNdarray_HOST_DIMS(cnda_%(z)s)[1],
                CudaNdarray_HOST_DIMS(cnda_%(z)s)[2],
                CudaNdarray_HOST_DIMS(cnda_%(z)s)[3],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(x)s),
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(z)s),
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(gz)s),
                CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(gx)s));
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                    "kDownsampleMaxGrad_%(nodename)s",
                    cudaGetErrorString(err),
                    grid.x,
                    grid.y,
                    block.x,
                    block.y,
                    block.z);
                %(fail)s;
            }
        }
        """ % locals()
    def c_support_code_apply(self, node, nodename):
        ignore_border = int(self.ignore_border)
        # Kernel layout: one thread per input column (x_col); each block
        # handles one (batch, row) pair and loops over all channels (i1).
        # Each thread writes the gradient for one input pixel per channel:
        # my_gz where the input equals the pooled max, 0 elsewhere.
        return """
        template<int ds0, int ds1>
        __global__ void kDownsampleMaxGrad_%(nodename)s(
           int D0, int D1, int D2, int D3, int xD2, int xD3,
           const float * x, int xS0, int xS1, int xS2, int xS3,
           const float * z, int zS0, int zS1, int zS2, int zS3,
           const float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
           float *gx)
        {
            float cur_max, cur_x, my_z, my_gz;
            int i0 = blockIdx.x;
            int i1 = 0;
            int i2 = blockIdx.y; // row wrt z and/or gz
            int x_col = threadIdx.x;
            // The algorithm here is that every thread writes one output pixel per line
            for (i1 = 0; i1 < D1; ++i1)
            {
                if (%(ignore_border)s && (x_col >= ds1 * D3))
                {
                    my_gz = 0;
                }
                else
                {
                    my_gz = gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + (x_col/ds1)*gzS3];
                    my_z =   z[i0 *  zS0 + i1 *  zS1 + i2 *  zS2 + (x_col/ds1)* zS3];
                }
                for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (%(ignore_border)s || (x_row < xD2)); ++x_row)
                {
                    gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col]
                       = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col]) ? my_gz : 0;
                }
            }
        }
        """ % locals()
......@@ -4,6 +4,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
from theano_cuda_ndarray.basic_ops import *
from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv
from theano_cuda_ndarray.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from theano_cuda_ndarray.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx)
......@@ -148,42 +149,6 @@ def local_gpu_sum(node):
return [host_from_gpu(GpuSum(reduce_mask)(gpu_from_host(x)))]
return False
import theano.sandbox.conv
@register_opt()
@local_optimizer([])
def local_gpu_conv(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)
    conv(host_from_gpu) -> host_from_gpu(conv)
    """
    def GpuConvOp_from_ConvOp(op):
        # Build a GpuConv with the same geometry as the CPU ConvOp.
        ret = GpuConv(border_mode=op.out_mode,
                subsample=(op.dx, op.dy),
                logical_img_hw=op.imshp_logical[1:3],
                logical_kern_hw=op.kshp_logical,
                logical_kern_align_top=op.kshp_logical_top_aligned
                )
        #HACK to print the number of MFlops in the profiler output.
        if hasattr(op,'flops'):
            ret.flops=op.flops
        return ret
    # Case 1: a conv result is being moved to the GPU -- replace the
    # whole pattern with a GpuConv fed by GPU transfers of its inputs.
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, theano.sandbox.conv.ConvOp):
            gpu_conv = GpuConvOp_from_ConvOp(host_input.owner.op)
            img, kern = host_input.owner.inputs
            return [gpu_conv(gpu_from_host(img), gpu_from_host(kern))]
    # Case 2: a CPU conv whose img and/or kern already live on the GPU --
    # run the conv on the GPU and transfer the result back.
    if isinstance(node.op, theano.sandbox.conv.ConvOp):
        img, kern = node.inputs
        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
        kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
        if img_on_gpu or kern_on_gpu:
            gpu_conv = GpuConvOp_from_ConvOp(node.op)
            return [host_from_gpu(gpu_conv(gpu_from_host(img), gpu_from_host(kern)))]
@register_opt()
@local_optimizer([])
def local_gpu_reshape(node):
......@@ -265,3 +230,61 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
gpu_from_host(cast(yidx, 'float32')))
return [host_from_gpu(gpu_dx)]
return False
#### Convolution, maxpooling
import theano.sandbox.conv
@register_opt()
@local_optimizer([])
def local_gpu_conv(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)
    conv(host_from_gpu) -> host_from_gpu(conv)
    """
    def GpuConvOp_from_ConvOp(op):
        # Build a GpuConv mirroring the CPU ConvOp's geometry parameters.
        ret = GpuConv(border_mode=op.out_mode,
                subsample=(op.dx, op.dy),
                logical_img_hw=op.imshp_logical[1:3],
                logical_kern_hw=op.kshp_logical,
                logical_kern_align_top=op.kshp_logical_top_aligned
                )
        #HACK to print the number of MFlops in the profiler output.
        if hasattr(op,'flops'):
            ret.flops=op.flops
        return ret
    # Pattern 1: gpu_from_host(conv(...)) -- push the conv itself onto
    # the GPU and transfer its inputs instead of its output.
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, theano.sandbox.conv.ConvOp):
            gpu_conv = GpuConvOp_from_ConvOp(host_input.owner.op)
            img, kern = host_input.owner.inputs
            return [gpu_conv(gpu_from_host(img), gpu_from_host(kern))]
    # Pattern 2: conv(host_from_gpu(...), ...) -- either input already on
    # the GPU is enough to justify running the conv there.
    if isinstance(node.op, theano.sandbox.conv.ConvOp):
        img, kern = node.inputs
        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
        kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
        if img_on_gpu or kern_on_gpu:
            gpu_conv = GpuConvOp_from_ConvOp(node.op)
            return [host_from_gpu(gpu_conv(gpu_from_host(img), gpu_from_host(kern)))]
import theano.sandbox.downsample
@register_opt()
@local_optimizer([])
def local_gpu_downsample_factor_max(node):
    """Lift DownsampleFactorMax to the GPU when its input already lives there:
    DownsampleFactorMax(host_from_gpu(x)) -> host_from_gpu(GpuDownsampleFactorMax(x))
    """
    if not isinstance(node.op, theano.sandbox.downsample.DownsampleFactorMax):
        return
    inp, = node.inputs
    if inp.owner is None or inp.owner.op != host_from_gpu:
        return
    gpu_op = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border)
    # Pool directly on the GPU value, transferring only the pooled result.
    return [host_from_gpu(gpu_op(inp.owner.inputs[0]))]
@register_opt()
@local_optimizer([])
def local_gpu_downsample_factor_max_grad(node):
    """Lift DownsampleFactorMaxGrad to the GPU when x already lives there;
    z and gz are transferred to the GPU as needed.
    """
    if not isinstance(node.op, theano.sandbox.downsample.DownsampleFactorMaxGrad):
        return
    x_in, z_in, gz_in = node.inputs
    if x_in.owner and x_in.owner.op == host_from_gpu:
        grad_op = GpuDownsampleFactorMaxGrad(node.op.ds, node.op.ignore_border)
        gpu_grad = grad_op(x_in.owner.inputs[0],
                           gpu_from_host(z_in),
                           gpu_from_host(gz_in))
        return [host_from_gpu(gpu_grad)]
import sys, time
import theano, theano.sandbox.conv
import theano
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc
from theano import tensor
import theano.tensor.nnet
import theano.sandbox.conv
import theano.sandbox.downsample
import numpy
import theano_cuda_ndarray as tcn
......@@ -251,8 +255,10 @@ def run_conv_nnet2_classif(shared_fn, isize, ksize, n_batch=60, n_iter=25):
conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = theano.sandbox.conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w0)+b0)
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1)
ds_op = theano.sandbox.downsample.DownsampleFactorMax((2,2), ignore_border=False)
hid = tensor.tanh(ds_op(conv_op(x, w0)+b0))
hid1 = tensor.tanh(conv_op1(hid, w1) + b1)
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论