提交 71338c0b authored 作者: James Bergstra's avatar James Bergstra

nnet_conv1 passes with all computations on GPU

上级 9c146bd8
......@@ -609,3 +609,124 @@ class GpuSum(Op):
def perform(self, node, inputs, output_storage):
    """Store `x.reduce_sum(self.reduce_mask)` in the single output cell.

    `inputs` holds exactly one value (`x`) and `output_storage` exactly one
    one-element storage cell.
    """
    x, = inputs
    z, = output_storage
    z[0] = x.reduce_sum(self.reduce_mask)
class GpuReshape(tensor.Reshape):
    """Reshape variant whose output variable is a CudaNdarrayType."""

    def make_node(self, x, shp):
        """Build the Apply node reshaping `x` to the shape given by `shp`.

        The single output is a CudaNdarrayType with `self.ndim` dimensions,
        none of which is flagged as broadcastable.
        """
        broadcastable = [False] * self.ndim
        out_type = CudaNdarrayType(broadcastable)
        return Apply(self, [x, shp], [out_type()])

    def perform(self, node, inputs, output_storage):
        """Reshape `x` to `shp` and store the result in the output cell.

        Raises ValueError when `shp` does not have exactly `self.ndim`
        entries.
        """
        x, shp = inputs
        out, = output_storage
        n_given = len(shp)
        if n_given != self.ndim:
            raise ValueError('shape argument to Reshape.perform has incorrect length %i'
                    ', should be %i' % (n_given, self.ndim), shp)
        out[0] = x.reshape(tuple(shp))
class GpuDimFlip(Op):
    """This Op implements a very special case of Subtensor, in which some (or all) of the
    strides are negated.

    This Op should be erased when a proper GpuSubtensor is implemented.

    `mask` holds one flag per dimension; a true flag requests that the
    corresponding axis be flipped.  `mask` is hashed by `__hash__`, so it must
    be hashable (e.g. a tuple).
    """

    def __init__(self, mask):
        Op.__init__(self)
        self.mask = mask

    def __eq__(self, other):
        return type(self) == type(other) and self.mask == other.mask

    def __hash__(self):
        return hash(type(self)) ^ hash(self.mask)

    def __str__(self):
        return '%s{%s}' %(self.__class__.__name__, str(self.mask))

    def perform(self, node, inputs, output_storage):
        """Store in the output cell a view of `x` with the masked axes flipped.

        For every flagged axis longer than one element the stride is negated
        and the data offset is advanced by `(length - 1) * stride`.
        """
        x, = inputs
        out, = output_storage
        z = x.view()
        total_dev_data_offset = 0
        for i, f in enumerate(self.mask):
            # Axes of length <= 1 look the same either way, so they are skipped.
            if f and z.shape[i] > 1:
                # Accumulate into the variable that is applied after the loop
                # (the original incremented an undefined name here).
                # NOTE(review): the `dim`, `str` and `dev_data` attributes are
                # used as written in the original -- confirm that the
                # CudaNdarray view actually exposes them under these names.
                total_dev_data_offset += (z.dim[i] - 1) * z.str[i]
                z.str[i] *= -1
        z.dev_data += total_dev_data_offset
        out[0] = z
class GpuSubtensor(tensor.Subtensor):
    """Subtensor variant whose output is a CudaNdarrayType view of its input.

    Only slice entries are supported in `perform`; integer entries raise
    NotImplementedError because they would require reducing the view's rank.
    """

    def make_node(self, x, *inputs):
        """Build the node via `tensor.Subtensor.make_node`, then re-type it.

        The first input is replaced by `x` itself and the output type is
        replaced by a CudaNdarrayType with the same broadcastable pattern.
        """
        rval = tensor.Subtensor.make_node(self, x, *inputs)
        rval.inputs[0] = x # clobber the 'astensor'
        rval.outputs[0].type = CudaNdarrayType(rval.outputs[0].type.broadcastable)
        return rval

    @staticmethod
    def _resolve_slice(slc, length):
        """Return `(start, step, newlen)` for `slc` applied to an axis of `length`.

        Uses `slice.indices`, so negative bounds count from the end, missing
        bounds default according to the sign of the step, and out-of-range
        bounds are clamped to the axis.  A zero step raises ValueError.
        """
        def as_index(value):
            # Bounds may arrive as numeric objects that are not plain ints;
            # coerce them so `slice.indices` accepts them.
            if value is None:
                return None
            return int(value)

        normalized = slice(as_index(slc.start),
                           as_index(slc.stop),
                           as_index(slc.step))
        start, stop, step = normalized.indices(int(length))
        # Ceiling division towards the direction of travel gives the number of
        # elements visited; e.g. 0:5:2 selects 3 elements, not 5 // 2 == 2.
        if step > 0:
            span = stop - start + step - 1
        else:
            span = stop - start + step + 1
        return start, step, max(0, span // step)

    def perform(self, node, inputs, output_storage):
        """Store in the output cell a view of `inputs[0]` restricted by `self.idx_list`.

        `inputs[1:]` supply, in order, the runtime values for every `Type`
        placeholder found in `self.idx_list`.

        Raises NotImplementedError for integer index entries.
        """
        out, = output_storage
        # Reversed so that pop() hands the values out in their original order.
        indices = list(reversed(inputs[1:]))

        def convert(entry):
            if isinstance(entry, Type):
                return indices.pop()
            elif isinstance(entry, slice):
                return slice(convert(entry.start),
                             convert(entry.stop),
                             convert(entry.step))
            else:
                return entry

        x = inputs[0].view()
        out[0] = x

        #todo; when this works, put it into CudaNdarray.__getitem__
        # (sequence protocol)
        x_shape = x.shape
        x_strides = x._strides
        offset = 0
        for i, thing in enumerate(map(convert, self.idx_list)):
            if isinstance(thing, int):
                #this requires reducing the rank of the
                # view....
                raise NotImplementedError()
            if isinstance(thing, slice):
                start, step, newlen = self._resolve_slice(thing, x_shape[i])
                offset += x_strides[i] * start
                x._set_shape_i(i, newlen)
                x._set_stride(i, x_strides[i] * step)

        # NOTE(review): the offset is accumulated as stride * index and scaled
        # by a hard-coded element size of 4 -- this assumes float32 data and
        # that `_dev_data` is a byte address; confirm against CudaNdarray.
        sizeof_float = 4
        x._dev_data += offset * sizeof_float
class GpuShape(tensor.Shape):
    """Shape op whose node uses its input as given and outputs an `lvector`."""

    def make_node(self, x):
        """Return an Apply node with input `x` and one `tensor.lvector` output."""
        shape_out = tensor.lvector()
        return Apply(self, [x], [shape_out])


# Module-level instance of the op.
gpu_shape = GpuShape()
......@@ -2,6 +2,8 @@ from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar
import StringIO
import cuda_ndarray
class GpuDot22(Op):
def __str__(self):
return 'GpuDot22'
......@@ -109,3 +111,76 @@ class GpuGemm(Op):
Py_INCREF(cnda_%(z_out)s);
""" % locals()
# Module-level GpuGemm instance.
gpu_gemm = GpuGemm()
##
# Not really a BLAS operation, but whatever.
#
class GpuConv(Op):
    """Convolution Op that delegates the computation to `cuda_ndarray.conv`.

    Instances compare and hash by their constructor parameters, so every
    parameter must be hashable (`subsample` e.g. as a tuple).
    """

    @staticmethod
    def logical_output_shape_2d(imshp, kshp, mode):
        """Return the (rows, cols) output shape for a 2D image and kernel shape.

        `mode` is 'valid' (image - kernel + 1) or 'full' (image + kernel - 1);
        any other value raises ValueError.
        """
        if mode == 'valid':
            return imshp[0] - kshp[0] + 1, imshp[1] - kshp[1] + 1
        if mode == 'full':
            return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
        raise ValueError(mode)

    @staticmethod
    def _as_hw_pair(hw):
        """Return `hw` as a 2-tuple, or None when `hw` is None.

        Raises ValueError when `hw` does not hold exactly two entries.
        """
        if hw is None:
            return None
        pair = tuple(hw)
        if len(pair) != 2:
            raise ValueError('expected a (height, width) pair', pair)
        return pair

    def __init__(self, border_mode,
            subsample=(1,1),
            logical_img_hw=None,
            logical_kern_hw=None,
            logical_kern_align_top=True):
        """Store the convolution parameters.

        `logical_img_hw` and `logical_kern_hw` are optional (height, width)
        pairs.  Both attributes are always assigned (None when not given), so
        `__eq__`, `__hash__`, `__str__` and `perform` work with the defaults.
        """
        self.border_mode = border_mode
        self.subsample = subsample
        #TODO: reconsider this... since shapes are not given in constructor,
        # maybe a multiplier + offset is a more appropriate way of passing this logical
        # grid
        self.logical_img_hw = self._as_hw_pair(logical_img_hw)
        self.logical_kern_hw = self._as_hw_pair(logical_kern_hw)
        self.logical_kern_align_top = logical_kern_align_top

    def __eq__(self, other):
        return type(self) == type(other) \
            and self.border_mode == other.border_mode \
            and self.subsample == other.subsample \
            and self.logical_img_hw == other.logical_img_hw \
            and self.logical_kern_hw == other.logical_kern_hw \
            and self.logical_kern_align_top == other.logical_kern_align_top

    def __hash__(self):
        return hash(type(self)) \
            ^ hash(self.border_mode) \
            ^ hash(self.subsample) \
            ^ hash(self.logical_img_hw) \
            ^ hash(self.logical_kern_hw) \
            ^ hash(self.logical_kern_align_top)

    def __str__(self):
        return '%s{%s, %s, %s, %s, %s}' %(self.__class__.__name__,
                self.border_mode,
                str(self.subsample),
                str(self.logical_img_hw),
                str(self.logical_kern_hw),
                str(self.logical_kern_align_top))

    def make_node(self, img, kern):
        """Return an Apply node whose output has the same type as `img`.

        Raises TypeError unless `img` is 4-dimensional and `kern` has the
        same type as `img`.
        """
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if img.type != kern.type:
            raise TypeError('img and kern must have same type')
        return Apply(self, [img, kern], [img.type()])

    def perform(self, node, inputs, output_storage):
        """Run `cuda_ndarray.conv` on (img, kern) and store the result."""
        img, kern = inputs
        out, = output_storage
        out[0] = cuda_ndarray.conv(img, kern,
                mode=self.border_mode,
                subsample=self.subsample,
                logical_img_shape=self.logical_img_hw,
                logical_kern_shape=self.logical_kern_hw,
                kern_align=self.logical_kern_align_top)
......@@ -3,7 +3,7 @@ from theano import tensor, scalar, compile
from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
from .basic_ops import *
from .blas import gpu_dot22, gpu_gemm
from .blas import gpu_dot22, gpu_gemm, GpuConv
from theano.compile import optdb
#optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
......@@ -153,3 +153,79 @@ def local_gpu_sum(node):
return [host_from_gpu(GpuSum(reduce_mask)(gpu_from_host(x)))]
return False
import theano.sandbox.conv
@register_opt()
@local_optimizer([])
def local_gpu_conv(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)

    conv(host_from_gpu) -> host_from_gpu(gpu_conv)

    Returns a one-element replacement list when one of the patterns matches
    and False otherwise, like the other local optimizers in this module.
    """
    def GpuConvOp_from_ConvOp(op):
        # Carry the host ConvOp's parameters over to an equivalent GpuConv.
        return GpuConv(border_mode=op.out_mode,
                subsample=(op.dx, op.dy),
                logical_img_hw=op.imshp_logical[1:3],
                logical_kern_hw=op.kshp_logical,
                logical_kern_align_top=op.kshp_logical_top_aligned
                )

    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, theano.sandbox.conv.ConvOp):
            gpu_conv = GpuConvOp_from_ConvOp(host_input.owner.op)
            img, kern = host_input.owner.inputs
            return [gpu_conv(gpu_from_host(img), gpu_from_host(kern))]

    if isinstance(node.op, theano.sandbox.conv.ConvOp):
        img, kern = node.inputs
        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
        kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
        # One operand coming from the GPU is enough to move the convolution.
        if img_on_gpu or kern_on_gpu:
            gpu_conv = GpuConvOp_from_ConvOp(node.op)
            return [host_from_gpu(gpu_conv(gpu_from_host(img), gpu_from_host(kern)))]

    # Explicit "no replacement" result instead of implicitly returning None.
    return False
@register_opt()
@local_optimizer([])
def local_gpu_reshape(node):
    """
    gpu_from_host(reshape) -> gpu_reshape(gpu_from_host)

    reshape(host_from_gpu) -> host_from_gpu(gpu_reshape)

    Returns a one-element replacement list on a match, False otherwise.
    """
    if node.op == gpu_from_host:
        host_var = node.inputs[0]
        producer = host_var.owner
        if producer and isinstance(producer.op, tensor.Reshape):
            data, new_shape = producer.inputs
            gpu_op = GpuReshape(producer.op.ndim)
            return [gpu_op(gpu_from_host(data), new_shape)]

    if isinstance(node.op, tensor.Reshape):
        data, new_shape = node.inputs
        if data.owner and data.owner.op == host_from_gpu:
            gpu_data, = data.owner.inputs
            gpu_op = GpuReshape(node.op.ndim)
            return [host_from_gpu(gpu_op(gpu_data, new_shape))]

    return False
@register_opt()
@local_optimizer([])
def local_gpu_subtensor(node):
    """
    gpu_from_host(subtensor) -> gpu_subtensor(gpu_from_host)

    subtensor(host_from_gpu) -> host_from_gpu(gpu_subtensor)

    Returns a one-element replacement list on a match, False otherwise.
    """
    if node.op == gpu_from_host:
        host_var = node.inputs[0]
        producer = host_var.owner
        if producer and isinstance(producer.op, tensor.Subtensor):
            data = producer.inputs[0]
            index_inputs = producer.inputs[1:]
            gpu_op = GpuSubtensor(producer.op.idx_list)
            return [gpu_op(gpu_from_host(data), *index_inputs)]

    if isinstance(node.op, tensor.Subtensor):
        data = node.inputs[0]
        index_inputs = node.inputs[1:]
        if data.owner and data.owner.op == host_from_gpu:
            gpu_data, = data.owner.inputs
            gpu_op = GpuSubtensor(node.op.idx_list)
            return [host_from_gpu(gpu_op(gpu_data, *index_inputs))]

    return False
@register_opt()
@local_optimizer([])
def local_gpu_shape(node):
    """
    shape(host_from_gpu(x)) -> gpu_shape(x)

    Returns a one-element replacement list on a match, False otherwise.
    """
    if not isinstance(node.op, tensor.Shape):
        return False
    host_var, = node.inputs
    if host_var.owner and host_var.owner.op == host_from_gpu:
        gpu_var, = host_var.owner.inputs
        return [gpu_shape(gpu_var)]
    return False
import sys, time
import theano
import theano, theano.sandbox.conv
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc
from theano import tensor
......@@ -58,10 +58,65 @@ def run_nnet(use_gpu):
mode.print_summary()
return rval
def test_nnet_cpu_gpu():
def test_run_nnet():
    """Check that `run_nnet` gives matching results with and without the GPU.

    The numpy random generator is re-seeded with the same value before each
    run.
    """
    results = []
    for use_gpu in (False, True):
        numpy.random.seed(23456)
        results.append(run_nnet(use_gpu))
    cpu_result, gpu_result = results
    assert numpy.allclose(cpu_result, gpu_result, rtol=1e-4, atol=1e-6)
def run_conv_nnet1(shared_fn):
n_batch = 16
n_kern = 20
shape_img = (n_batch, 1, 32, 32)
shape_kern = (n_kern, 1, 5, 5)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((32,32),(5,5), 'valid')
n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
n_out = 10
w = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w')
b = shared_fn(numpy.asarray(numpy.zeros((n_kern,1,1)), dtype='float32'), 'b')
v = shared_fn(numpy.asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
c = shared_fn(numpy.asarray(numpy.zeros(n_out), dtype='float32'), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,0,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w)+b)
hid_flat = hid.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = theano.compile.ProfileMode()
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = numpy.asarray(numpy.random.rand(*shape_img), dtype='float32')
yval = numpy.asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
lr = numpy.asarray(0.01, dtype='float32')
for i in xrange(10):
rval = train(xval, yval, lr)
mode.print_summary()
return rval
def test_conv_nnet1():
    """Check that `run_conv_nnet1` agrees for `shared` and `tcn.shared_constructor`.

    The numpy random generator is re-seeded with the same value before each
    run.
    """
    results = []
    for make_shared in (shared, tcn.shared_constructor):
        numpy.random.seed(23456)
        results.append(run_conv_nnet1(make_shared))
    cpu_result, gpu_result = results
    assert numpy.allclose(cpu_result, gpu_result, rtol=1e-4, atol=1e-6)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论