提交 e7f39cef authored 作者: Frederic Bastien's avatar Frederic Bastien

Added the GpuDot22Scalar op, an optimization that inserts it, and tests.

上级 fb83675c
...@@ -71,6 +71,80 @@ class GpuDot22(Op): ...@@ -71,6 +71,80 @@ class GpuDot22(Op):
""" % locals() """ % locals()
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
class GpuDot22Scalar(Op):
    """
    GPU op taking two matrices `x`, `y` and a scalar `a`.

    The generated C code hands the three inputs to CudaNdarray_gemm with
    `a` as the first multiplier and 0 as the multiplier of the output
    buffer (presumably yielding a * dot(x, y); the accompanying test
    compares the result against dot(x, y) * a computed on the host).

    The op carries no parameters, so all instances compare and hash equal.
    """

    def __str__(self):
        return 'GpuDot22Scalar'

    def __eq__(self, other):
        # Stateless op: two instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        # Must agree with __eq__, hence hashing on the class only.
        return hash(type(self))

    def make_node(self, x, y, a):
        """
        Build the Apply node for inputs (x, y, a).

        Raises TypeError (carrying the offending variable) when `x` or `y`
        is not 2-dimensional, or when `a` is not accepted by
        tensor.blas._as_scalar.  The single output has the type of `x`.
        """
        for matrix in (x, y):
            if matrix.type.ndim != 2:
                raise TypeError(matrix)
        if not tensor.blas._as_scalar(a):
            raise TypeError(a)
        return Apply(self, [x, y, a], [x.type()])

    def c_code_cache_version(self):
        return (1, 0)

    def c_code(self, node, name, inputs, outputs, sub):
        """
        Return the C implementation as a string.

        The generated code:
          * reads the scalar as float when its NumPy type number is
            PyArray_FLOAT, and as double otherwise;
          * rejects x or y whose rank is not 2;
          * (re)allocates the output when it is missing or its shape is not
            (rows of x, columns of y);
          * calls CudaNdarray_gemm and, on failure, releases the output
            before running the failure code.
        """
        x, y, a = inputs
        z, = outputs
        # Only these names are referenced by the template below.
        substitutions = {
            'name': name,
            'x': x,
            'y': y,
            'a': a,
            'z': z,
            'fail': sub['fail'],
        }
        template = """
        #define REAL float
        float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
        ? (REAL)(((float*)%(a)s->data)[0])
        : (REAL)(((double*)%(a)s->data)[0]);
        #undef REAL
        if (%(x)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "rank(x)==%%i must be 2", %(x)s->nd);
            %(fail)s;
        }
        if (%(y)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "rank(y)==%%i must be 2", %(y)s->nd);
            %(fail)s;
        }
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(y)s)[1]))
        {
            //if (%(z)s) Py_DECREF(%(z)s);
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
            dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[1];
            %(z)s = (CudaNdarray*)CudaNdarray_new_null();
            if ((NULL == %(z)s) || CudaNdarray_alloc_contiguous(%(z)s, 2, dims))
            {
                if (%(z)s)
                {
                    Py_DECREF(%(z)s);
                    %(z)s = NULL;
                }
                %(fail)s;
            }
        }
        if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, 0.0f, %(z)s))
        {
            if (%(z)s)
            {
                Py_DECREF(%(z)s);
                %(z)s = NULL;
            }
            %(fail)s;
        }
        """
        return template % substitutions


# Shared instance used by the graph optimizer.
gpu_dot22scalar = GpuDot22Scalar()
class GpuGemm(Op): class GpuGemm(Op):
destroy_map = {0:[0]} destroy_map = {0:[0]}
def __str__(self): def __str__(self):
......
...@@ -6,7 +6,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to ...@@ -6,7 +6,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to
from theano.sandbox.cuda.basic_ops import * from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import gpu_dot22, gpu_gemm, GpuConv from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm, GpuConv
from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
...@@ -143,11 +143,11 @@ def local_gpu_dimshuffle_0(node): ...@@ -143,11 +143,11 @@ def local_gpu_dimshuffle_0(node):
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_dot(node): def local_gpu_dot22(node):
""" """
gpu_from_host(dot) -> gpudot(gpu_from_host) gpu_from_host(dot22) -> gpudot(gpu_from_host)
dot(host_from_gpu) -> host_from_gpu(gpudot) dot(host_from_gpu) -> host_from_gpu(gpudot22)
""" """
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -160,6 +160,25 @@ def local_gpu_dot(node): ...@@ -160,6 +160,25 @@ def local_gpu_dot(node):
return [host_from_gpu(gpu_dot22(gpu_from_host(x), gpu_from_host(y)))] return [host_from_gpu(gpu_dot22(gpu_from_host(x), gpu_from_host(y)))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_dot22scalar(node):
    """
    Replace a host dot22scalar by its GPU counterpart.

    gpu_from_host(dot22scalar) -> gpudot22scalar(gpu_from_host)
    dot22scalar(host_from_gpu) -> host_from_gpu(gpudot22scalar)

    Returns a one-element list with the replacement variable, or False when
    `node` matches neither pattern.
    """
    host_dot22scalar = tensor.blas._dot22scalar

    if node.op == gpu_from_host:
        # Pattern 1: the result of a host dot22scalar is being sent to the
        # GPU; compute it there directly instead.
        producer = node.inputs[0].owner
        if producer and producer.op == host_dot22scalar:
            x, y, scalar = producer.inputs
            gpu_x = gpu_from_host(x)
            gpu_y = gpu_from_host(y)
            alpha = tensor.blas._as_scalar(scalar)
            return [gpu_dot22scalar(gpu_x, gpu_y, alpha)]

    if node.op == host_dot22scalar:
        # Pattern 2: at least one input already comes from the GPU; run the
        # product on the GPU and bring the result back to the host.
        fed_from_gpu = [(inp.owner and inp.owner.op == host_from_gpu)
                        for inp in node.inputs]
        if numpy.any(fed_from_gpu):
            x, y, scalar = node.inputs
            gpu_x = gpu_from_host(x)
            gpu_y = gpu_from_host(y)
            alpha = tensor.blas._as_scalar(scalar)
            return [host_from_gpu(gpu_dot22scalar(gpu_x, gpu_y, alpha))]

    return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemm(node): def local_gpu_gemm(node):
......
...@@ -44,6 +44,33 @@ def test_dot(): ...@@ -44,6 +44,33 @@ def test_dot():
assert numpy.allclose(numpy.dot(a0, bval), a.value) assert numpy.allclose(numpy.dot(a0, bval), a.value)
def test_dot22scalar():
    """
    Check that dot(a, b) * scalar is compiled to GpuDot22Scalar in GPU mode.

    Two variants are exercised: a float32 constant multiplier and a symbolic
    fscalar multiplier.  In both, the optimized graph must be exactly
    GpuFromHost, GpuFromHost, GpuDot22Scalar, HostFromGpu, and the GPU
    result must match the one from the default mode.
    """
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    scalar = tensor.fscalar()
    av = my_rand(4, 4)
    bv = my_rand(4, 4)

    def check_gpu_graph(compiled):
        # The whole computation should collapse into: two transfers to the
        # GPU, one fused op, one transfer back.
        nodes = compiled.maker.env.toposort()
        assert len(nodes) == 4
        assert isinstance(nodes[0].op, tcn.GpuFromHost)
        assert isinstance(nodes[1].op, tcn.GpuFromHost)
        assert isinstance(nodes[2].op, tcn.blas.GpuDot22Scalar)
        assert isinstance(nodes[3].op, tcn.HostFromGpu)

    # Variant 1: multiplier is a float32 constant.
    gpu_fn = theano.function(
        [a, b],
        tensor.dot(a, b) * numpy.asarray(4, 'float32'),
        mode=mode_with_gpu)
    ref_fn = theano.function(
        [a, b],
        tensor.dot(a, b) * numpy.asarray(4, 'float32'))
    check_gpu_graph(gpu_fn)
    assert numpy.allclose(gpu_fn(av, bv), ref_fn(av, bv))

    # Variant 2: multiplier is a symbolic scalar input.
    gpu_fn = theano.function(
        [a, b, scalar],
        tensor.dot(a, b) * scalar,
        mode=mode_with_gpu)
    ref_fn = theano.function(
        [a, b, scalar],
        tensor.dot(a, b) * scalar)
    check_gpu_graph(gpu_fn)
    assert numpy.allclose(gpu_fn(av, bv, 0.5), ref_fn(av, bv, 0.5))
def test_gemm(): def test_gemm():
a = tcn.shared_constructor(my_rand(4,4), 'a') a = tcn.shared_constructor(my_rand(4,4), 'a')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论