提交 71338c0b authored 作者: James Bergstra's avatar James Bergstra

nnet_conv1 passes with all computations on GPU

上级 9c146bd8
...@@ -609,3 +609,124 @@ class GpuSum(Op): ...@@ -609,3 +609,124 @@ class GpuSum(Op):
def perform(self, node, (x,), (z,)): def perform(self, node, (x,), (z,)):
z[0] = x.reduce_sum(self.reduce_mask) z[0] = x.reduce_sum(self.reduce_mask)
class GpuReshape(tensor.Reshape):
def make_node(self, x, shp):
return Apply(self, [x, shp], [CudaNdarrayType([False]*self.ndim)()])
def perform(self, node, (x, shp), (out,)):
if (len(shp) != self.ndim):
raise ValueError('shape argument to Reshape.perform has incorrect length %i'
', should be %i' % (len(shp), self.ndim), shp)
out[0] = x.reshape(tuple(shp))
class GpuDimFlip(Op):
"""This Op implements a very special case of Subtensor, in which some (or all) of the
strides are negated.
This Op should be erased when a proper GpuSubtensor is implemented.
"""
def __init__(self, mask):
Op.__init__(self)
self.mask = mask
def __eq__(self, other):
return type(self) == type(other) and self.mask == other.mask
def __hash__(self):
return hash(type(self)) ^ hash(self.mask)
def __str__(self):
return '%s{%s}' %(self.__class__.__name__, str(self.mask))
def perform(self, node, (x,), (out,)):
z = x.view()
total_dev_data_offset = 0
for i, f in enumerate(self.mask):
if f and z.shape[i] > 1:
dev_data_offset += (z.dim[i] - 1) * z.str[i]
z.str[i] *= -1
z.dev_data += total_dev_data_offset
out[0] = z
class GpuSubtensor(tensor.Subtensor):
def make_node(self, x, *inputs):
rval = tensor.Subtensor.make_node(self, x, *inputs)
rval.inputs[0] = x # clobber the 'astensor'
rval.outputs[0].type = CudaNdarrayType(rval.outputs[0].type.broadcastable)
return rval
def perform(self, node, inputs, (out, )):
indices = list(reversed(inputs[1:]))
def convert(entry):
if isinstance(entry, Type):
return indices.pop()
elif isinstance(entry, slice):
return slice(convert(entry.start),
convert(entry.stop),
convert(entry.step))
else:
return entry
x = inputs[0].view()
out[0] = x
#todo; when this works, put it into CudaNdarray.__getitem__
# (sequence protocol)
x_shape = x.shape
x_strides = x._strides
offset = 0
for i, thing in enumerate(map(convert, self.idx_list)):
if isinstance(thing, int):
#this requires reducing the rank of the
# view....
raise NotImplementedError()
if isinstance(thing, slice):
#stride
if thing.step is None:
stride = 1
else:
stride = thing.step
#start
if thing.start is None:
if stride > 0:
start = 0
else:
start = x_shape[i]-1
else:
if thing.start < 0:
start = x_shape[i] - thing.start
else:
start = thing.start
#stop
if thing.stop is None:
if stride > 0:
stop = x_shape[i]
else:
stop = -1
else:
if thing.stop < 0:
stop = x_shape[i] - thing.stop
else:
stop = thing.stop
newlen = (stop - start) // stride
offset += x_strides[i] * start
x._set_shape_i(i, newlen)
x._set_stride(i, x_strides[i] * stride)
#print 'perform', id(x), x.shape, i, thing
sizeof_float = 4
x._dev_data += offset * sizeof_float
#sys.stdout.flush()
#sys.exit()
class GpuShape(tensor.Shape):
    """Shape Op whose node is built directly on the given variable.

    The single output is a ``tensor.lvector``.
    """

    def make_node(self, x):
        """Return the Apply node mapping ``x`` to an ``lvector`` output."""
        shape_out = tensor.lvector()
        return Apply(self, [x], [shape_out])


# Shared instance used by the GPU optimizations.
gpu_shape = GpuShape()
...@@ -2,6 +2,8 @@ from theano import Op, Type, Apply, Variable, Constant ...@@ -2,6 +2,8 @@ from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar from theano import tensor, scalar
import StringIO import StringIO
import cuda_ndarray
class GpuDot22(Op): class GpuDot22(Op):
def __str__(self): def __str__(self):
return 'GpuDot22' return 'GpuDot22'
...@@ -109,3 +111,76 @@ class GpuGemm(Op): ...@@ -109,3 +111,76 @@ class GpuGemm(Op):
Py_INCREF(cnda_%(z_out)s); Py_INCREF(cnda_%(z_out)s);
""" % locals() """ % locals()
gpu_gemm = GpuGemm() gpu_gemm = GpuGemm()
##
# Not really a BLAS operation, but whatever.
#
class GpuConv(Op):
@staticmethod
def logical_output_shape_2d(imshp, kshp, mode):
if mode == 'valid':
return imshp[0] - kshp[0] + 1, imshp[1] - kshp[1] + 1
if mode == 'full':
return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
raise ValueError(mode)
def __init__(self, border_mode,
subsample=(1,1),
logical_img_hw=None,
logical_kern_hw=None,
logical_kern_align_top=True):
self.border_mode = border_mode
self.subsample = subsample
if logical_img_hw is not None:
h,w = logical_img_hw
#TODO: reconsider this... since shapes are not given in constructor,
# maybe a multiplier + offset is a more appropriate way of passing this logical
# grid
self.logical_img_hw = tuple(logical_img_hw)
if logical_kern_hw is not None:
h,w = logical_kern_hw
#TODO: reconsider this... since shapes are not given in constructor,
# maybe a multiplier + offset is a more appropriate way of passing this logical
# grid
self.logical_kern_hw = tuple(logical_kern_hw)
self.logical_kern_align_top = logical_kern_align_top
def __eq__(self, other):
return type(self) == type(other) \
and self.border_mode == other.border_mode \
and self.subsample == other.subsample \
and self.logical_img_hw == other.logical_img_hw \
and self.logical_kern_hw == other.logical_kern_hw \
and self.logical_kern_align_top == other.logical_kern_align_top
def __hash__(self):
return hash(type(self)) \
^ hash(self.border_mode) \
^ hash(self.subsample) \
^ hash(self.logical_img_hw) \
^ hash(self.logical_kern_hw) \
^ hash(self.logical_kern_align_top)
def __str__(self):
return '%s{%s, %s, %s, %s, %s}' %(self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.logical_img_hw),
str(self.logical_kern_hw),
str(self.logical_kern_align_top))
def make_node(self, img, kern):
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if img.type != kern.type:
raise TypeError('img and kern must have same type')
return Apply(self, [img, kern], [img.type()])
def perform(self, node, (img, kern), (out,)):
out[0] = cuda_ndarray.conv(img, kern,
mode=self.border_mode,
subsample=self.subsample,
logical_img_shape=self.logical_img_hw,
logical_kern_shape=self.logical_kern_hw,
kern_align=self.logical_kern_align_top)
...@@ -3,7 +3,7 @@ from theano import tensor, scalar, compile ...@@ -3,7 +3,7 @@ from theano import tensor, scalar, compile
from theano.gof import local_optimizer, EquilibriumDB, SequenceDB from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
from .basic_ops import * from .basic_ops import *
from .blas import gpu_dot22, gpu_gemm from .blas import gpu_dot22, gpu_gemm, GpuConv
from theano.compile import optdb from theano.compile import optdb
#optdb.print_summary() # this shows what is currently registered (in a so-far crude way...) #optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
...@@ -153,3 +153,79 @@ def local_gpu_sum(node): ...@@ -153,3 +153,79 @@ def local_gpu_sum(node):
return [host_from_gpu(GpuSum(reduce_mask)(gpu_from_host(x)))] return [host_from_gpu(GpuSum(reduce_mask)(gpu_from_host(x)))]
return False return False
import theano.sandbox.conv
@register_opt()
@local_optimizer([])
def local_gpu_conv(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)

    conv(host_from_gpu) -> host_from_gpu(gpu_conv)

    Returns a one-element list holding the replacement variable, or False when
    neither pattern matches.
    """
    def GpuConvOp_from_ConvOp(op):
        # Carry the host ConvOp's configuration over to the GPU Op.
        return GpuConv(border_mode=op.out_mode,
                subsample=(op.dx, op.dy),
                logical_img_hw=op.imshp_logical[1:3],
                logical_kern_hw=op.kshp_logical,
                logical_kern_align_top=op.kshp_logical_top_aligned
                )

    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, theano.sandbox.conv.ConvOp):
            gpu_conv = GpuConvOp_from_ConvOp(host_input.owner.op)
            img, kern = host_input.owner.inputs
            return [gpu_conv(gpu_from_host(img), gpu_from_host(kern))]

    if isinstance(node.op, theano.sandbox.conv.ConvOp):
        img, kern = node.inputs
        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
        kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
        if img_on_gpu or kern_on_gpu:
            gpu_conv = GpuConvOp_from_ConvOp(node.op)
            return [host_from_gpu(gpu_conv(gpu_from_host(img), gpu_from_host(kern)))]

    # Explicit "no change", matching the other local GPU optimizations.
    return False
@register_opt()
@local_optimizer([])
def local_gpu_reshape(node):
    """
    gpu_from_host(reshape) -> gpu_reshape(gpu_from_host)

    reshape(host_from_gpu) -> host_from_gpu(gpu_reshape)

    Returns a one-element list holding the replacement, or False.
    """
    op = node.op

    if op == gpu_from_host:
        producer = node.inputs[0].owner
        if producer and isinstance(producer.op, tensor.Reshape):
            data, new_shape = producer.inputs
            gpu_reshape = GpuReshape(producer.op.ndim)
            return [gpu_reshape(gpu_from_host(data), new_shape)]

    if isinstance(op, tensor.Reshape):
        data, new_shape = node.inputs
        producer = data.owner
        if producer and producer.op == host_from_gpu:
            (gpu_data,) = producer.inputs
            gpu_reshape = GpuReshape(op.ndim)
            return [host_from_gpu(gpu_reshape(gpu_data, new_shape))]

    return False
@register_opt()
@local_optimizer([])
def local_gpu_subtensor(node):
    """
    gpu_from_host(subtensor) -> gpu_subtensor(gpu_from_host)

    subtensor(host_from_gpu) -> host_from_gpu(gpu_subtensor)

    Returns a one-element list holding the replacement, or False.
    """
    op = node.op

    if op == gpu_from_host:
        producer = node.inputs[0].owner
        if producer and isinstance(producer.op, tensor.Subtensor):
            data = producer.inputs[0]
            index_inputs = producer.inputs[1:]
            gpu_subtensor = GpuSubtensor(producer.op.idx_list)
            return [gpu_subtensor(gpu_from_host(data), *index_inputs)]

    if isinstance(op, tensor.Subtensor):
        data = node.inputs[0]
        index_inputs = node.inputs[1:]
        producer = data.owner
        if producer and producer.op == host_from_gpu:
            (gpu_data,) = producer.inputs
            gpu_subtensor = GpuSubtensor(op.idx_list)
            return [host_from_gpu(gpu_subtensor(gpu_data, *index_inputs))]

    return False
@register_opt()
@local_optimizer([])
def local_gpu_shape(node):
    """
    shape(host_from_gpu(x)) -> gpu_shape(x)

    Returns a one-element list holding the replacement, or False.
    """
    if not isinstance(node.op, tensor.Shape):
        return False
    (host_x,) = node.inputs
    producer = host_x.owner
    if producer and producer.op == host_from_gpu:
        (gpu_x,) = producer.inputs
        return [gpu_shape(gpu_x)]
    return False
import sys, time import sys, time
import theano import theano, theano.sandbox.conv
from theano.compile.sandbox.sharedvalue import shared from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc from theano.compile.sandbox.pfunc import pfunc
from theano import tensor from theano import tensor
...@@ -58,10 +58,65 @@ def run_nnet(use_gpu): ...@@ -58,10 +58,65 @@ def run_nnet(use_gpu):
mode.print_summary() mode.print_summary()
return rval return rval
def test_nnet_cpu_gpu(): def test_run_nnet():
numpy.random.seed(23456) numpy.random.seed(23456)
rval_cpu = run_nnet(False) rval_cpu = run_nnet(False)
numpy.random.seed(23456) numpy.random.seed(23456)
rval_gpu = run_nnet(True) rval_gpu = run_nnet(True)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6) assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def run_conv_nnet1(shared_fn):
n_batch = 16
n_kern = 20
shape_img = (n_batch, 1, 32, 32)
shape_kern = (n_kern, 1, 5, 5)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((32,32),(5,5), 'valid')
n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
n_out = 10
w = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w')
b = shared_fn(numpy.asarray(numpy.zeros((n_kern,1,1)), dtype='float32'), 'b')
v = shared_fn(numpy.asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
c = shared_fn(numpy.asarray(numpy.zeros(n_out), dtype='float32'), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,0,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w)+b)
hid_flat = hid.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = theano.compile.ProfileMode()
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = numpy.asarray(numpy.random.rand(*shape_img), dtype='float32')
yval = numpy.asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
lr = numpy.asarray(0.01, dtype='float32')
for i in xrange(10):
rval = train(xval, yval, lr)
mode.print_summary()
return rval
def test_conv_nnet1():
    """Check that run_conv_nnet1 gives close results with ``shared`` and with
    ``tcn.shared_constructor`` when started from the same random seed."""
    seed = 23456

    numpy.random.seed(seed)
    cpu_result = run_conv_nnet1(shared)

    numpy.random.seed(seed)
    gpu_result = run_conv_nnet1(tcn.shared_constructor)

    assert numpy.allclose(cpu_result, gpu_result, rtol=1e-4, atol=1e-6)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论