Commit b3528941 authored by James Bergstra

profiling a basic nnet

Parent 0f60bf1a
...@@ -131,7 +131,7 @@ class GpuElemwise(Op): ...@@ -131,7 +131,7 @@ class GpuElemwise(Op):
def c_src_kernel(self, node, nodename): def c_src_kernel(self, node, nodename):
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
sio = StringIO.StringIO() sio = StringIO.StringIO()
print 'C_SRC_KERNEL', sio.getvalue() #print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x): def _logical_scalar(x):
return all(x.type.broadcastable) return all(x.type.broadcastable)
...@@ -202,7 +202,7 @@ class GpuElemwise(Op): ...@@ -202,7 +202,7 @@ class GpuElemwise(Op):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}" print >> sio, "}"
print sio.getvalue() #print sio.getvalue()
return sio.getvalue() return sio.getvalue()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -582,3 +582,25 @@ class GpuDimShuffle(Op): ...@@ -582,3 +582,25 @@ class GpuDimShuffle(Op):
return sio.getvalue() return sio.getvalue()
class GpuSum(Op):
    """Sum a CudaNdarray over the axes flagged in ``reduce_mask``.

    ``reduce_mask`` holds one 0/1 flag per input dimension: dimensions
    flagged 1 are summed out, dimensions flagged 0 are kept in the output.
    """
    def __init__(self, reduce_mask):
        # Stored as a tuple so the op is hashable and cheaply comparable.
        self.reduce_mask = tuple(reduce_mask)

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.reduce_mask == other.reduce_mask

    def __hash__(self):
        # Combine class identity with the mask so distinct masks hash apart.
        return hash(type(self)) ^ hash(self.reduce_mask)

    def __str__(self):
        return "GpuSum{%s}" % str(self.reduce_mask)

    def make_node(self, x):
        """Check x's rank against the mask and build the reduced-type Apply."""
        if x.type.ndim != len(self.reduce_mask):
            raise TypeError("x must have rank %i" % len(self.reduce_mask))
        # The output keeps only the broadcastable flags of non-reduced dims.
        kept = [bc for bc, m in zip(x.type.broadcastable, self.reduce_mask)
                if not m]
        return Apply(self, [x], [CudaNdarrayType(kept)()])

    def perform(self, node, inp, out):
        # Delegate the actual reduction to the CudaNdarray method.
        x, = inp
        z, = out
        z[0] = x.reduce_sum(self.reduce_mask)
...@@ -3,6 +3,8 @@ from theano import tensor, scalar ...@@ -3,6 +3,8 @@ from theano import tensor, scalar
import StringIO import StringIO
class GpuDot22(Op): class GpuDot22(Op):
def __str__(self):
return 'GpuDot22'
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
...@@ -67,6 +69,8 @@ gpu_dot22 = GpuDot22() ...@@ -67,6 +69,8 @@ gpu_dot22 = GpuDot22()
class GpuGemm(Op): class GpuGemm(Op):
destroy_map = {0:[0]} destroy_map = {0:[0]}
def __str__(self):
return 'GpuGemm'
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
......
import sys
from theano import tensor, scalar, compile from theano import tensor, scalar, compile
from theano.gof import local_optimizer, EquilibriumDB from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
from .basic_ops import * from .basic_ops import *
from .blas import gpu_dot22, gpu_gemm from .blas import gpu_dot22, gpu_gemm
...@@ -8,8 +9,12 @@ from theano.compile import optdb ...@@ -8,8 +9,12 @@ from theano.compile import optdb
#optdb.print_summary() # this shows what is currently registered (in a so-far crude way...) #optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1, 'fast_run', 'inplace')
gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2, 'fast_run', 'inplace')
optdb.register('gpu', optdb.register('gpu',
gpu_optimizer, gpu_seqopt,
optdb.__priority__.get('inplace_opt', 75) + 5, optdb.__priority__.get('inplace_opt', 75) + 5,
'fast_run', 'fast_run',
'inplace') 'inplace')
...@@ -21,25 +26,23 @@ def register_opt(*tags, **kwargs): ...@@ -21,25 +26,23 @@ def register_opt(*tags, **kwargs):
return local_opt return local_opt
return f return f
@register_opt() @local_optimizer([])
@local_optimizer([GpuFromHost(), None]) def local_cut_gpu_host_gpu(node):
def local_gpu_host_gpu(node): if tensor.opt.opt.check_chain(node, GpuFromHost(), HostFromGpu()):
if not tensor.opt.opt.check_chain(node, GpuFromHost(), HostFromGpu()): return [node.inputs[0].owner.inputs[0]]
return False if tensor.opt.opt.check_chain(node, HostFromGpu(), GpuFromHost()):
return [node.inputs[0].owner.inputs[0]] return [node.inputs[0].owner.inputs[0]]
return False
@register_opt() gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu, 'fast_run', 'inplace', 'gpu')
@local_optimizer([HostFromGpu(), None])
def local_host_gpu_host(node):
if not tensor.opt.opt.check_chain(node, HostFromGpu(), GpuFromHost()):
return False
return [node.inputs[0].owner.inputs[0]]
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_elemwise_0(node): def local_gpu_elemwise_0(node):
if isinstance(node.op, tensor.Elemwise): if isinstance(node.op, tensor.Elemwise):
if any(hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs): if any(hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs):
if any(o.type.dtype == 'float64' for o in node.outputs):
print 'EXITING FROM local_gpu_elemwise_0', node
sys.exit()
# move the add to a GpuAdd # move the add to a GpuAdd
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern) new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))] return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))]
...@@ -132,3 +135,21 @@ def local_gpu_gemm(node): ...@@ -132,3 +135,21 @@ def local_gpu_gemm(node):
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))] return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
    """Move a sum (CAReduce with scalar.add) onto the GPU when its input
    already lives there (i.e. arrives through a host_from_gpu transfer).

    Returns a one-element replacement list on success, False otherwise.
    """
    op = node.op
    if not (isinstance(op, tensor.elemwise.CAReduce)
            and op.scalar_op == scalar.add):
        return False
    x, = node.inputs
    if not (x.owner and x.owner.op == host_from_gpu):
        return False
    # Translate the reduce axes into a per-dimension 0/1 mask for GpuSum.
    if op.axis is None:
        mask = [1] * x.type.ndim  # axis=None means sum over everything
    else:
        mask = [0] * x.type.ndim
        for a in op.axis:
            assert mask[a] == 0  # each axis may be listed at most once
            mask[a] = 1
    return [host_from_gpu(GpuSum(mask)(gpu_from_host(x)))]
# Benchmark script: profile a small MLP on CPU vs GPU (theano_cuda_ndarray).
import sys, time
import theano
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc
from theano import tensor
import numpy
import theano_cuda_ndarray as tcn

import logging
# Raise the gradient logger to INFO to quiet per-op debug chatter.
logging.getLogger('theano.gradient').setLevel(logging.INFO)
def run_nnet(use_gpu):
n_batch = 16
n_in = 1024
n_hid = 2048
n_out = 10
if use_gpu:
w = tcn.shared_constructor(0.01*(numpy.random.rand(n_in,n_hid)-0.5), 'w')
b = tcn.shared_constructor(numpy.zeros(n_hid), 'b')
v = tcn.shared_constructor(numpy.zeros((n_hid, n_out)), 'c')
c = tcn.shared_constructor(numpy.zeros(n_out), 'c')
else:
w = shared(0.01*(numpy.random.rand(n_in,n_hid)-0.5), 'w')
b = shared(numpy.zeros(n_hid), 'b')
v = shared(numpy.zeros((n_hid, n_out)), 'c')
c = shared(numpy.zeros(n_out), 'c')
x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
hid = tensor.tanh(tensor.dot(x, w)+b)
out = tensor.tanh(tensor.dot(hid, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = theano.compile.ProfileMode()
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = numpy.asarray(numpy.random.rand(n_batch, n_in), dtype='float32')
yval = numpy.asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
lr = numpy.asarray(0.01, dtype='float32')
for i in xrange(100):
train(xval, yval, lr)
mode.print_summary()
def test_nnet_cpu():
    """Profile the MLP on the host (CPU) path."""
    run_nnet(use_gpu=False)
def test_nnet_gpu():
    """Profile the MLP on the GPU (CudaNdarray) path."""
    run_nnet(use_gpu=True)
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment