提交 cee77d13 authored 作者: James Bergstra's avatar James Bergstra

added maxpooling op GpuDownsampleMax

上级 3272fee0
...@@ -187,3 +187,238 @@ class GpuConv(Op): ...@@ -187,3 +187,238 @@ class GpuConv(Op):
kern_align=self.logical_kern_align_top, kern_align=self.logical_kern_align_top,
verbose=0) verbose=0)
from theano.sandbox.downsample import DownsampleFactorMax
class GpuDownsampleFactorMax(DownsampleFactorMax):
    """Max-pooling (downsampling by factors ``ds = (ds0, ds1)``) on the GPU.

    ``__eq__``, ``__hash__``, ``__str__`` and the ``ds`` / ``ignore_border``
    attributes are inherited from ``DownsampleFactorMax``; only the CUDA
    code path is implemented here.
    """

    def make_node(self, x):
        # x must be a variable holding a 4d CudaNdarray; the rank is
        # verified at runtime by the generated C code below.
        return Apply(self, [x], [x.type()])

    def perform(self, node, input_storage, output_storage):
        # No Python implementation: this Op exists only for its C code.
        raise NotImplementedError('only C is implemented')

    def c_code_cache_version(self):
        # Empty tuple => never cache the compiled module (code still in flux).
        return ()

    def c_code(self, node, nodename, inp, out, sub):
        """Allocate/resize the output and launch the kMaxPool kernel."""
        x, = inp   # unpack here: tuple parameters are Python-2-only syntax
        z, = out
        fail = sub['fail']
        ds0, ds1 = self.ds
        ignore_border = int(self.ignore_border)
        return """
        int dims[4], xdim2, xdim3;
        if (cnda_%(x)s->nd != 4)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        xdim2 = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2];
        xdim3 = CudaNdarray_HOST_DIMS(cnda_%(x)s)[3];
        dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
        dims[1] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[1];
        dims[2] = xdim2 / %(ds0)s;
        dims[3] = xdim3 / %(ds1)s;
        if (! %(ignore_border)s)
        {
            // partial windows at the bottom/right border still produce output
            dims[2] += (xdim2%%(%(ds0)s)?1:0);
            dims[3] += (xdim3%%(%(ds1)s)?1:0);
        }
        if ((NULL == cnda_%(z)s)
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != dims[0])
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != dims[1])
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[2] != dims[2])
            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[3] != dims[3]))
        {
            Py_XDECREF(cnda_%(z)s);
            cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
            if ((NULL == cnda_%(z)s)
                || CudaNdarray_alloc_contiguous(cnda_%(z)s, 4, dims))
            {
                Py_XDECREF(cnda_%(z)s);
                cnda_%(z)s = NULL;
                %(fail)s;
            }
        }
        {
            dim3 grid(dims[0] * dims[1], dims[2]);
            //dim3 block(std::min(dims[3], 512)); //TODO: implement this by supporting more
            //outputs than threads
            dim3 block(dims[3]);
            // The third launch parameter is the dynamic shared-memory size
            // in BYTES: the kernel buffers one float per input column, so a
            // plain `xdim3` was 4x too small.
            kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3*sizeof(float)>>>(
                dims[0], dims[1], dims[2], dims[3], xdim2, xdim3,
                CudaNdarray_DEV_DATA(cnda_%(x)s),
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(z)s));
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kMaxPool_%(nodename)s", cudaGetErrorString(err));
                %(fail)s;
            }
        }
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        """CUDA kernel: one block per (image, channel, output row) triple,
        one thread per output column."""
        ignore_border = int(self.ignore_border)
        return """
        template<int pf2, int pf3>
        __global__ void kMaxPool_%(nodename)s(
           int D0, int D1, int D2, int D3, int xD2, int xD3,
           const float * x, int xS0, int xS1, int xS2, int xS3,
           float *z)
        {
            float cur_max, cur_x;
            // gridDim.x == D0*D1, so the pair must be decoded with D1
            // (dividing by D0, as this used to, scrambles the two axes and
            // indexes out of bounds whenever D0 != D1)
            int i0 = blockIdx.x / D1;
            int i1 = blockIdx.x %% D1;
            int i2 = blockIdx.y;
            extern __shared__ float xbuf[]; //size [xD3]
            for (int r2 = 0; (r2 < pf2) && (%(ignore_border)s || (r2 + i2*pf2 < xD2)); ++r2)
            {
                __syncthreads();
                // load the current row of the image into shared memory
                for (int i3 = threadIdx.x; i3 < xD3; i3 += blockDim.x)
                {
                    xbuf[i3] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + i3*xS3];
                }
                __syncthreads();
                // initialize our max if this is the first row we're loading
                cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max;
                // mini-reduction over the pf3 elements of this window's row;
                // the column guard keeps partial border windows in bounds
                for (int k = 0; (k < pf3) && (%(ignore_border)s || (k + threadIdx.x*pf3 < xD3)); ++k)
                {
                    cur_x = xbuf[threadIdx.x*pf3+k];
                    // keep the LARGER value (`<` here used to compute a min-pool)
                    cur_max = (cur_x > cur_max) ? cur_x : cur_max;
                }
            }
            //store the result to global memory
            z[i0 * D1*D2*D3 + i1*D2*D3 + i2*D3 + threadIdx.x] = cur_max;
        }
        """ % locals()
from theano.sandbox.downsample import DownsampleFactorMaxGrad
class GpuDownsampleFactorMaxGrad(DownsampleFactorMaxGrad):
    """Gradient of GpuDownsampleFactorMax.

    Inputs are the original input ``x``, the pooled output ``z`` and the
    gradient ``gz`` wrt ``z``; the output ``gx`` routes each gz value to the
    position(s) of ``x`` that attained the max of their window.
    ``__eq__``, ``__hash__``, ``__str__`` are inherited.
    """

    def make_node(self, x, z, gz):
        # All three inputs are 4d CudaNdarray variables (rank checked in C);
        # the gradient has the same type/shape as x.
        return Apply(self, [x, z, gz], [x.type()])

    def perform(self, node, input_storage, output_storage):
        # No Python implementation: this Op exists only for its C code.
        raise NotImplementedError('only C is implemented')

    def c_code_cache_version(self):
        # Empty tuple => never cache the compiled module (code still in flux).
        return ()

    def c_code(self, node, nodename, inp, out, sub):
        """Allocate/resize gx and launch the kDownsampleMaxGrad kernel."""
        x, z, gz = inp   # unpack here: tuple parameters are Python-2-only syntax
        gx, = out
        fail = sub['fail']
        ds0, ds1 = self.ds
        ignore_border = int(self.ignore_border)
        return """
        if (cnda_%(x)s->nd != 4
            || cnda_%(z)s->nd != 4
            || cnda_%(gz)s->nd != 4)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if ((NULL == cnda_%(gx)s)
            || (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1])
            || (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[2] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[2])
            || (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[3] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]))
        {
            Py_XDECREF(cnda_%(gx)s);
            cnda_%(gx)s = (CudaNdarray*)CudaNdarray_new_null();
            if ((NULL == cnda_%(gx)s)
                || CudaNdarray_alloc_contiguous(cnda_%(gx)s, 4, CudaNdarray_HOST_DIMS(cnda_%(x)s)))
            {
                Py_XDECREF(cnda_%(gx)s);
                cnda_%(gx)s = NULL;
                %(fail)s;
            }
        }
        {
            if (%(ignore_border)s)
            {
                // With ignore_border, rows of x past the last full pooling
                // window are never visited by the kernel; zero gx up front so
                // those gradient entries are not left uninitialized.
                int n_elems = CudaNdarray_HOST_DIMS(cnda_%(gx)s)[0]
                    * CudaNdarray_HOST_DIMS(cnda_%(gx)s)[1]
                    * CudaNdarray_HOST_DIMS(cnda_%(gx)s)[2]
                    * CudaNdarray_HOST_DIMS(cnda_%(gx)s)[3];
                cudaMemset(CudaNdarray_DEV_DATA(cnda_%(gx)s), 0,
                           n_elems * sizeof(float));
            }
            // One block per (image, pooled row) pair. The second grid
            // dimension must come from z: taking it from x (as this used to)
            // launched xD2 blocks, reading z/gz and writing gx out of bounds
            // whenever ds0 > 1.
            dim3 grid(CudaNdarray_HOST_DIMS(cnda_%(z)s)[0], CudaNdarray_HOST_DIMS(cnda_%(z)s)[2]);
            //TODO: implement this by supporting more
            //outputs than threads
            dim3 block(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]);
            kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
                CudaNdarray_HOST_DIMS(cnda_%(z)s)[0],
                CudaNdarray_HOST_DIMS(cnda_%(z)s)[1],
                CudaNdarray_HOST_DIMS(cnda_%(z)s)[2],
                CudaNdarray_HOST_DIMS(cnda_%(z)s)[3],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(x)s),
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(z)s),
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(gz)s),
                CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(gx)s));
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                    "kDownsampleMaxGrad_%(nodename)s",
                    cudaGetErrorString(err),
                    grid.x,
                    grid.y,
                    block.x,
                    block.y,
                    block.z);
                %(fail)s;
            }
        }
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        """CUDA kernel: block (i0, i2) iterates over channels i1; thread
        x_col handles one column of x per row."""
        ignore_border = int(self.ignore_border)
        return """
        template<int ds0, int ds1>
        __global__ void kDownsampleMaxGrad_%(nodename)s(
           int D0, int D1, int D2, int D3, int xD2, int xD3,
           const float * x, int xS0, int xS1, int xS2, int xS3,
           const float * z, int zS0, int zS1, int zS2, int zS3,
           const float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
           float *gx)
        {
            float cur_max, cur_x, my_z, my_gz;
            int i0 = blockIdx.x;
            int i1 = 0;
            int i2 = blockIdx.y; // row wrt z and/or gz
            int x_col = threadIdx.x;
            // The algorithm here is that every thread writes one output pixel per line
            for (i1 = 0; i1 < D1; ++i1)
            {
                if (%(ignore_border)s && (x_col >= ds1 * D3))
                {
                    // column falls in the ignored border: no gradient flows;
                    // zero my_z too so the comparison below is well defined
                    my_gz = 0;
                    my_z = 0;
                }
                else
                {
                    my_gz = gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + (x_col/ds1)*gzS3];
                    my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 + (x_col/ds1)* zS3];
                }
                // pass the gradient through only where x equals the window max
                for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (%(ignore_border)s || (x_row < xD2)); ++x_row)
                {
                    gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col]
                       = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col]) ? my_gz : 0;
                }
            }
        }
        """ % locals()
...@@ -4,6 +4,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB ...@@ -4,6 +4,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
from theano_cuda_ndarray.basic_ops import * from theano_cuda_ndarray.basic_ops import *
from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv
from theano_cuda_ndarray.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from theano_cuda_ndarray.nnet import ( from theano_cuda_ndarray.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx) GpuCrossentropySoftmax1HotWithBiasDx)
...@@ -148,42 +149,6 @@ def local_gpu_sum(node): ...@@ -148,42 +149,6 @@ def local_gpu_sum(node):
return [host_from_gpu(GpuSum(reduce_mask)(gpu_from_host(x)))] return [host_from_gpu(GpuSum(reduce_mask)(gpu_from_host(x)))]
return False return False
import theano.sandbox.conv
@register_opt()
@local_optimizer([])
def local_gpu_conv(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)
    conv(host_from_gpu) -> host_from_gpu(conv)
    """
    def GpuConvOp_from_ConvOp(op):
        # Build a GpuConv that mirrors the CPU ConvOp's configuration.
        ret = GpuConv(border_mode=op.out_mode,
                subsample=(op.dx, op.dy),
                logical_img_hw=op.imshp_logical[1:3],
                logical_kern_hw=op.kshp_logical,
                logical_kern_align_top=op.kshp_logical_top_aligned
                )
        #HACK to print the number of MFlops in the profiler output.
        if hasattr(op,'flops'):
            ret.flops=op.flops
        return ret
    # Case 1: a convolution result is being transferred to the GPU ->
    # run the convolution on the GPU instead, transferring its inputs.
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, theano.sandbox.conv.ConvOp):
            gpu_conv = GpuConvOp_from_ConvOp(host_input.owner.op)
            img, kern = host_input.owner.inputs
            return [gpu_conv(gpu_from_host(img), gpu_from_host(kern))]
    # Case 2: a convolution consumes data already on the GPU -> run the
    # GPU convolution and move the result back to the host for consumers.
    if isinstance(node.op, theano.sandbox.conv.ConvOp):
        img, kern = node.inputs
        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
        kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
        if img_on_gpu or kern_on_gpu:
            gpu_conv = GpuConvOp_from_ConvOp(node.op)
            return [host_from_gpu(gpu_conv(gpu_from_host(img), gpu_from_host(kern)))]
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_reshape(node): def local_gpu_reshape(node):
...@@ -265,3 +230,61 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node): ...@@ -265,3 +230,61 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
gpu_from_host(cast(yidx, 'float32'))) gpu_from_host(cast(yidx, 'float32')))
return [host_from_gpu(gpu_dx)] return [host_from_gpu(gpu_dx)]
return False return False
#### Convolution, maxpooling
import theano.sandbox.conv
@register_opt()
@local_optimizer([])
def local_gpu_conv(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)
    conv(host_from_gpu) -> host_from_gpu(conv)
    """
    def _gpu_conv_from_cpu_conv(cpu_op):
        # Mirror the CPU ConvOp's configuration on a GpuConv instance.
        gpu_op = GpuConv(border_mode=cpu_op.out_mode,
                         subsample=(cpu_op.dx, cpu_op.dy),
                         logical_img_hw=cpu_op.imshp_logical[1:3],
                         logical_kern_hw=cpu_op.kshp_logical,
                         logical_kern_align_top=cpu_op.kshp_logical_top_aligned)
        #HACK to print the number of MFlops in the profiler output.
        if hasattr(cpu_op, 'flops'):
            gpu_op.flops = cpu_op.flops
        return gpu_op

    # Case 1: a convolution result is being moved to the GPU -> do the
    # convolution on the GPU instead, transferring its inputs.
    if node.op == gpu_from_host:
        inner = node.inputs[0]
        if inner.owner and isinstance(inner.owner.op, theano.sandbox.conv.ConvOp):
            img, kern = inner.owner.inputs
            gpu_conv = _gpu_conv_from_cpu_conv(inner.owner.op)
            return [gpu_conv(gpu_from_host(img), gpu_from_host(kern))]

    # Case 2: a convolution consumes data already on the GPU -> compute on
    # the GPU and transfer the result back for host-side consumers.
    if isinstance(node.op, theano.sandbox.conv.ConvOp):
        img, kern = node.inputs
        either_on_gpu = ((img.owner and img.owner.op == host_from_gpu)
                         or (kern.owner and kern.owner.op == host_from_gpu))
        if either_on_gpu:
            gpu_conv = _gpu_conv_from_cpu_conv(node.op)
            return [host_from_gpu(gpu_conv(gpu_from_host(img),
                                           gpu_from_host(kern)))]
import theano.sandbox.downsample
@register_opt()
@local_optimizer([])
def local_gpu_downsample_factor_max(node):
    """downsample_factor_max(host_from_gpu(x)) ->
    host_from_gpu(gpu_downsample_factor_max(x))"""
    if not isinstance(node.op, theano.sandbox.downsample.DownsampleFactorMax):
        return
    x, = node.inputs
    if not (x.owner and x.owner.op == host_from_gpu):
        return
    # Rebuild the op on the GPU with the same pooling configuration and
    # consume the GPU-resident input directly.
    gpu_op = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border)
    return [host_from_gpu(gpu_op(x.owner.inputs[0]))]
@register_opt()
@local_optimizer([])
def local_gpu_downsample_factor_max_grad(node):
    """downsample_factor_max_grad(host_from_gpu(x), z, gz) ->
    host_from_gpu(gpu_downsample_factor_max_grad(x, gpu(z), gpu(gz)))"""
    if not isinstance(node.op, theano.sandbox.downsample.DownsampleFactorMaxGrad):
        return
    x, z, gz = node.inputs
    if not (x.owner and x.owner.op == host_from_gpu):
        return
    # Rebuild the grad op on the GPU; z and gz are transferred explicitly.
    gpu_op = GpuDownsampleFactorMaxGrad(node.op.ds, node.op.ignore_border)
    return [host_from_gpu(gpu_op(x.owner.inputs[0],
                                 gpu_from_host(z),
                                 gpu_from_host(gz)))]
import sys, time import sys, time
import theano, theano.sandbox.conv import theano
from theano.compile.sandbox.sharedvalue import shared from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc from theano.compile.sandbox.pfunc import pfunc
from theano import tensor from theano import tensor
import theano.tensor.nnet import theano.tensor.nnet
import theano.sandbox.conv
import theano.sandbox.downsample
import numpy import numpy
import theano_cuda_ndarray as tcn import theano_cuda_ndarray as tcn
...@@ -251,8 +255,10 @@ def run_conv_nnet2_classif(shared_fn, isize, ksize, n_batch=60, n_iter=25): ...@@ -251,8 +255,10 @@ def run_conv_nnet2_classif(shared_fn, isize, ksize, n_batch=60, n_iter=25):
conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1) conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = theano.sandbox.conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1) conv_op1 = theano.sandbox.conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w0)+b0) ds_op = theano.sandbox.downsample.DownsampleFactorMax((2,2), ignore_border=False)
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1)
hid = tensor.tanh(ds_op(conv_op(x, w0)+b0))
hid1 = tensor.tanh(conv_op1(hid, w1) + b1)
hid_flat = hid1.reshape((n_batch, n_hid)) hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c) out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr) loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论