Commit 0f60bf1a authored by James Bergstra

gemm and dot added

Parent commit: 31afc498
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar
import StringIO
class GpuDot22(Op):
    """Matrix-matrix product of two 2-d CudaNdarrays.

    The C implementation delegates to ``CudaNdarray_gemm(1.0, x, y, 0.0, z)``,
    allocating (or re-allocating) the output ``z`` whenever it is missing or
    has the wrong dimensions.
    """
    def __eq__(self, other):
        # All instances are interchangeable: equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        # Consistent with __eq__: hash by type only.
        return hash(type(self))

    def make_node(self, x, y):
        """Return an Apply for x @ y; both inputs must be rank-2.

        :raises TypeError: if either input is not 2-dimensional.
        """
        if x.type.ndim != 2:
            raise TypeError(x)
        if y.type.ndim != 2:
            raise TypeError(y)
        # Output has the same type as x (shape checking happens at runtime
        # in the C code below).
        return Apply(self, [x, y], [x.type()])

    def c_code_cache_version(self):
        # Empty tuple: no versioning yet, code is recompiled as needed.
        return ()

    def c_code(self, node, nodename, inputs, outputs, sub):
        x, y = inputs
        z, = outputs
        fail = sub['fail']
        return """
        if (cnda_%(x)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "rank(x)==%%i must be 2", cnda_%(x)s->nd);
            %(fail)s;
        }
        if (cnda_%(y)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "rank(y)==%%i must be 2", cnda_%(y)s->nd);
            %(fail)s;
        }
        if ((NULL == cnda_%(z)s)
            || (cnda_%(z)s->dim[0] != cnda_%(x)s->dim[0])
            || (cnda_%(z)s->dim[1] != cnda_%(y)s->dim[1]))
        {
            /* Output missing or mis-shaped: (re)allocate a contiguous
             * (x.rows, y.cols) CudaNdarray. */
            if (cnda_%(z)s) Py_DECREF(cnda_%(z)s);
            npy_intp dims[2];
            dims[0] = cnda_%(x)s->dim[0];
            dims[1] = cnda_%(y)s->dim[1];
            cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
            if ((NULL == cnda_%(z)s) || CudaNdarray_alloc_contiguous(cnda_%(z)s, 2, dims))
            {
                if (cnda_%(z)s)
                {
                    Py_DECREF(cnda_%(z)s);
                    cnda_%(z)s = NULL;
                }
                %(fail)s;
            }
        }
        /* z = 1.0 * x . y + 0.0 * z */
        if (CudaNdarray_gemm(1.0f, cnda_%(x)s, cnda_%(y)s, 0.0f, cnda_%(z)s))
        {
            if (cnda_%(z)s)
            {
                Py_DECREF(cnda_%(z)s);
                cnda_%(z)s = NULL;
            }
            %(fail)s;
        }
        """ % locals()

gpu_dot22 = GpuDot22()
class GpuGemm(Op):
    """In-place GEMM on CudaNdarrays: z <- gemm(z, a, x, y, b).

    The first input is destroyed (updated in place) and returned as the
    output, hence ``destroy_map``.  The scalars ``a`` and ``b`` stay on the
    host; they may be float32 or float64 ndarrays (see the C code).
    """
    # Output 0 overwrites input 0 (z).
    destroy_map = {0: [0]}

    def __eq__(self, other):
        # All instances are interchangeable: equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        # Consistent with __eq__: hash by type only.
        return hash(type(self))

    def make_node(self, z, a, x, y, b):
        # the more complicated error checking performed by tensor.gemm
        # is assumed to already have been done
        return Apply(self, [z, a, x, y, b], [z.type()])

    def c_code_cache_version(self):
        # Empty tuple: no versioning yet, code is recompiled as needed.
        return ()

    def c_code(self, node, name, inputs, outputs, sub):
        z_in, a, x, y, b = inputs
        z_out, = outputs
        fail = sub['fail']
        return """
        #define REAL float
        float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
            ? (REAL)(((float*)%(a)s->data)[0])
            : (REAL)(((double*)%(a)s->data)[0]);
        float %(name)s_b = (%(b)s->descr->type_num == PyArray_FLOAT) ?
            (REAL)(((float*)%(b)s->data)[0])
            : (REAL)(((double*)%(b)s->data)[0]);
        #undef REAL
        if (CudaNdarray_gemm(%(name)s_a, cnda_%(x)s, cnda_%(y)s, %(name)s_b, cnda_%(z_in)s))
        {
            %(fail)s;
        }
        /* The op computes in place: the output IS the (updated) input. */
        cnda_%(z_out)s = cnda_%(z_in)s;
        Py_INCREF(cnda_%(z_out)s);
        """ % locals()

gpu_gemm = GpuGemm()
from theano import tensor, scalar, compile
from theano.gof import local_optimizer, EquilibriumDB
from theano.compile import optdb

from .basic_ops import *
from .blas import gpu_dot22, gpu_gemm

#optdb.print_summary()  # this shows what is currently registered (in a so-far crude way...)

# Database of local optimizers that move graph nodes onto the GPU.
gpu_optimizer = EquilibriumDB()

# Run the GPU database just after the inplace optimizations
# (priority inplace_opt + 5, defaulting to 80 when unknown).
optdb.register('gpu',
        gpu_optimizer,
        optdb.__priority__.get('inplace_opt', 75) + 5,
        'fast_run',
        'inplace')
def register_opt(*tags, **kwargs):
    """Decorator factory: register a local optimizer in ``gpu_optimizer``.

    The optimizer is registered under ``kwargs['name']`` when given,
    otherwise under the decorated function's ``__name__``, and always with
    the 'fast_run' and 'inplace' tags plus any extra ``tags``.
    """
    def f(local_opt):
        # Use pop(..., None) so passing unrelated kwargs (or none at all)
        # cannot raise KeyError, unlike `kwargs and kwargs.pop('name')`.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'inplace', *tags)
        return local_opt
    return f
@register_opt()
@local_optimizer([GpuFromHost(), None])
def local_gpu_host_gpu(node):
    """Collapse the redundant round-trip gpu_from_host(host_from_gpu(x)) -> x."""
    if not tensor.opt.opt.check_chain(node, GpuFromHost(), HostFromGpu()):
        return False
    # Return the variable that was originally on the GPU.
    return [node.inputs[0].owner.inputs[0]]
@register_opt()
@local_optimizer([HostFromGpu(), None])
def local_host_gpu_host(node):
    """Collapse the redundant round-trip host_from_gpu(gpu_from_host(x)) -> x."""
    if not tensor.opt.opt.check_chain(node, HostFromGpu(), GpuFromHost()):
        return False
    # Return the variable that was originally on the host.
    return [node.inputs[0].owner.inputs[0]]
# NOTE(review): the span below is a side-by-side diff scrape (old and new
# revision text fused on each line) and the '@@ -25,8 +44,9 @@' hunk marker
# means part of local_gpu_elemwise_0's body is omitted from this view.
# The function cannot be safely reconstructed from here -- recover the full
# file from version control before editing.
@gof.local_optimizer([]) @register_opt()
@local_optimizer([])
def local_gpu_elemwise_0(node): def local_gpu_elemwise_0(node):
if isinstance(node.op, tensor.Elemwise): if isinstance(node.op, tensor.Elemwise):
if any(hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs): if any(hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs):
...@@ -25,8 +44,9 @@ def local_gpu_elemwise_0(node): ...@@ -25,8 +44,9 @@ def local_gpu_elemwise_0(node):
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern) new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))] return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))]
return False return False
tensor.opt.register_specialize(local_gpu_elemwise_0, 'gpu')
# NOTE(review): fused side-by-side diff scrape; the '@@ -38,9 +58,9 @@' hunk
# marker means part of local_gpu_elemwise_1's body is omitted from this view.
# Do not edit from this text -- recover the full file from version control.
@gof.local_optimizer([]) @register_opt()
@local_optimizer([])
def local_gpu_elemwise_1(node): def local_gpu_elemwise_1(node):
""" """
gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...)) gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...))
...@@ -38,9 +58,9 @@ def local_gpu_elemwise_1(node): ...@@ -38,9 +58,9 @@ def local_gpu_elemwise_1(node):
new_op = GpuElemwise(elemwise_node.op.scalar_op, elemwise_node.op.inplace_pattern) new_op = GpuElemwise(elemwise_node.op.scalar_op, elemwise_node.op.inplace_pattern)
return [new_op(*(gpu_from_host(i) for i in elemwise_node.inputs))] return [new_op(*(gpu_from_host(i) for i in elemwise_node.inputs))]
return False return False
tensor.opt.register_specialize(local_gpu_elemwise_1, 'gpu')
# NOTE(review): fused side-by-side diff scrape; the '@@ -56,9 +76,9 @@' hunk
# marker means part of local_gpu_dimshuffle_0's body is omitted from this
# view.  Do not edit from this text -- recover the full file from version
# control.
@gof.local_optimizer([]) @register_opt()
@local_optimizer([])
def local_gpu_dimshuffle_0(node): def local_gpu_dimshuffle_0(node):
""" """
dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle) dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle)
...@@ -56,9 +76,9 @@ def local_gpu_dimshuffle_0(node): ...@@ -56,9 +76,9 @@ def local_gpu_dimshuffle_0(node):
else: else:
return [host_from_gpu(new_op(gpu_from_host(tensor.tensor_copy(input))))] return [host_from_gpu(new_op(gpu_from_host(tensor.tensor_copy(input))))]
return False return False
tensor.opt.register_specialize(local_gpu_dimshuffle_0, 'gpu')
# NOTE(review): fused side-by-side diff scrape; the '@@ -71,5 +91,44 @@' hunk
# marker means part of local_gpu_dimshuffle_1's body is omitted from this
# view.  Do not edit from this text -- recover the full file from version
# control.
@gof.local_optimizer([]) @register_opt()
@local_optimizer([])
def local_gpu_dimshuffle_1(node): def local_gpu_dimshuffle_1(node):
""" """
gpu_from_host(dimshuffle) -> gpu_dimshuffle(gpu_from_host) gpu_from_host(dimshuffle) -> gpu_dimshuffle(gpu_from_host)
...@@ -71,5 +91,44 @@ def local_gpu_dimshuffle_1(node): ...@@ -71,5 +91,44 @@ def local_gpu_dimshuffle_1(node):
dimshuffle_node.op.new_order) dimshuffle_node.op.new_order)
return [new_op(gpu_from_host(dimshuffle_node.inputs[0]))] return [new_op(gpu_from_host(dimshuffle_node.inputs[0]))]
return False return False
tensor.opt.register_specialize(local_gpu_dimshuffle_1, 'gpu')
@register_opt()
@local_optimizer([])
def local_gpu_dot(node):
    """
    gpu_from_host(dot) -> gpudot(gpu_from_host)
    dot(host_from_gpu) -> host_from_gpu(gpudot)
    """
    if node.op == gpu_from_host:
        # Case 1: the result of a host _dot22 is being moved to the GPU;
        # compute the product on the GPU instead.
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.blas._dot22:
            x, y = host_input.owner.inputs
            return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))]
    if node.op == tensor.blas._dot22:
        # Case 2: at least one operand already lives on the GPU; any
        # redundant gpu_from_host(host_from_gpu(...)) pairs introduced here
        # are collapsed by local_gpu_host_gpu.
        if any((i.owner and i.owner.op == host_from_gpu) for i in node.inputs):
            x, y = node.inputs
            return [host_from_gpu(gpu_dot22(gpu_from_host(x), gpu_from_host(y)))]
    return False
@register_opt()
@local_optimizer([])
def local_gpu_gemm(node):
    """
    gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
    gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
    """
    if node.op == gpu_from_host:
        # Case 1: a host gemm result is being moved to the GPU; run the
        # gemm there instead.  The scalars a and b stay on the host
        # (see GpuGemm.make_node).
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.blas.gemm:
            z, a, x, y, b = host_input.owner.inputs
            return [gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
    if node.op == tensor.blas.gemm:
        # Case 2: any of the matrix operands already lives on the GPU.
        z, a, x, y, b = node.inputs
        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
    return False
import sys, time
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc
from theano import tensor
import numpy
import theano_cuda_ndarray as tcn
def test_dot():
a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')
b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, tensor.dot(a,b))])
a0 = a.value * 1.0
print a0
for i, node in enumerate(f.maker.env.toposort()):
print i, node
bval = numpy.random.rand(4,4)
f(bval)
print a.value
assert numpy.allclose(numpy.dot(a0, bval), a.value)
def test_gemm():
a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')
b = tensor.fmatrix('b')
c = tensor.fmatrix('c')
f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))])
a0 = a.value * 1.0
print a0
for i, node in enumerate(f.maker.env.toposort()):
print i, node
bval = numpy.random.rand(4,4)
cval = numpy.random.rand(4,4)
f(bval,cval)
print a.value
assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.value)
# NOTE(review): trailer of a third diffed file.  The '@@ -56,4 +56,7 @@'
# hunk marker means cmp_sigmoids_T's definition is omitted from this view;
# only the fused __main__ guard and some trailing comments survive.  Recover
# the full file from version control before editing.
...@@ -56,4 +56,7 @@ def cmp_sigmoids_T(shape): ...@@ -56,4 +56,7 @@ def cmp_sigmoids_T(shape):
if __name__ == '__main__': if __name__ == '__main__':
eval(sys.argv[1]) eval(sys.argv[1])
#cmp_sigmoids((640, 64*64)) # looks great in profiler
#cmp_sigmoids((173, 74*49))
#cmp_sigmoids_T((173, 74*49))
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with care.
Please finish editing this comment first!
Register or sign in to post a comment