提交 b457bb54 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1800 from abergeron/gpuarray_ger

Add GpuGer to gpuarray.
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.tensor.blas import Dot22, Gemv, Gemm from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable) from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try: try:
...@@ -28,7 +28,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -28,7 +28,7 @@ class GpuGemv(BlasOp, Gemv):
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
assert A.dtype == x.dtype == y.dtype == alpha.dtype == beta.dtype assert A.dtype == x.dtype == y.dtype
return Apply(self, [y, alpha, A, x, beta], [y.type()]) return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage): def perform(self, node, inputs, out_storage):
...@@ -45,8 +45,15 @@ class GpuGemv(BlasOp, Gemv): ...@@ -45,8 +45,15 @@ class GpuGemv(BlasOp, Gemv):
if self.inplace: if self.inplace:
code = """ code = """
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = %(y)s; if (%(y)s->ga.strides[0] <= 0) {
Py_INCREF(%(out)s); %(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
if (%(out)s == NULL) {
%(fail)s
}
} else {
%(out)s = %(y)s;
Py_INCREF(%(out)s);
}
""" % vars """ % vars
else: else:
code = """ code = """
...@@ -72,7 +79,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -72,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
gpugemv_no_inplace = GpuGemv(inplace=False) gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True) gpugemv_inplace = GpuGemv(inplace=True)
...@@ -84,7 +91,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -84,7 +91,7 @@ class GpuGemm(BlasOp, Gemm):
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B) B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C) C = as_gpuarray_variable(C)
assert A.dtype == B.dtype == C.dtype == alpha.dtype == beta.dtype assert A.dtype == B.dtype == C.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()]) return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
...@@ -101,8 +108,15 @@ class GpuGemm(BlasOp, Gemm): ...@@ -101,8 +108,15 @@ class GpuGemm(BlasOp, Gemm):
if self.inplace: if self.inplace:
code = """ code = """
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = %(C)s; if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
Py_INCREF(%(out)s); %(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
if (%(out)s == NULL) {
%(fail)s
}
} else {
%(out)s = %(C)s;
Py_INCREF(%(out)s);
}
""" % vars """ % vars
else: else:
code = """ code = """
...@@ -128,13 +142,74 @@ class GpuGemm(BlasOp, Gemm): ...@@ -128,13 +142,74 @@ class GpuGemm(BlasOp, Gemm):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True) gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp, Ger):
    """GER rank-1 update (A + alpha * outer(x, y)) on gpuarray buffers.

    ``destructive=True`` lets the C implementation overwrite A when its
    memory layout permits; otherwise A is copied first.
    """

    def make_node(self, A, alpha, x, y):
        """Validate inputs via the CPU op, then build a GPU Apply node."""
        # Run the parent's make_node for its dtype/ndim validation; its
        # result is discarded in favor of GPU-typed variables.
        Ger.make_node(self, A, alpha, x, y)
        A = as_gpuarray_variable(A)
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        assert A.dtype == x.dtype == y.dtype
        return Apply(self, [A, alpha, x, y], [A.type()])

    def perform(self, node, inp, out):
        """Python fallback: delegate to the BLAS ger routine.

        NOTE(review): `blas` is assumed to be a module-level import of a
        scipy-style BLAS wrapper elsewhere in this file — confirm.
        """
        A, alpha, x, y = inp
        inplace = self.destructive
        # Only overwrite A when it is Fortran- or C-contiguous.
        if inplace and not A.flags.forc:
            inplace = False
        # BUG FIX: the original assigned to the undefined name `outputs`;
        # the output-storage parameter of perform() is `out`.
        out[0][0] = blas.ger(alpha, x, y, A,
                             overwrite_a=inplace)

    def c_code(self, node, name, inp, out, sub):
        """Emit C code calling pygpu's BLAS ger on the output buffer."""
        # `fmt` (not `vars`, which shadows the builtin) holds the
        # %-substitution names used by the C templates below.
        fmt = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
                   fail=sub['fail'], name=name)
        if self.destructive:
            # In-place only when A is one contiguous segment; otherwise
            # fall back to an implicit copy so the update stays correct.
            code = """
            Py_XDECREF(%(out)s);
            if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) {
              %(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
              if (%(out)s == NULL) {
                %(fail)s
              }
            } else {
              %(out)s = %(A)s;
              Py_INCREF(%(out)s);
            }
            """ % fmt
        else:
            # Non-destructive: always operate on a fresh copy of A.
            code = """
            Py_XDECREF(%(out)s);
            %(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
            if (%(out)s == NULL) {
              %(fail)s
            }
            """ % fmt
        # alpha arrives as a 0-d CPU ndarray; read its scalar value.
        code += """
        if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                            %(x)s, %(y)s, %(out)s, 0) == -1) {
          %(fail)s
        }
        """ % fmt
        if config.gpuarray.sync:
            code += """
            GpuArray_sync(&%(out)s->ga);
            """ % fmt
        return code

    def c_code_cache_version(self):
        """Bump when the generated C code above changes."""
        return (1,)
# Module-level singletons, mirroring gpugemv_*/gpugemm_* above; the
# inplace optimizer below swaps the first for the second.
gpuger_no_inplace = GpuGer(destructive=False)
gpuger_inplace = GpuGer(destructive=True)
class GpuDot22(BlasOp, Dot22): class GpuDot22(BlasOp, Dot22):
def make_node(self, x, y): def make_node(self, x, y):
res = Dot22.make_node(self, x, y) res = Dot22.make_node(self, x, y)
...@@ -211,8 +286,13 @@ def local_inplace_gpuagemm(node): ...@@ -211,8 +286,13 @@ def local_inplace_gpuagemm(node):
if node.op == gpugemm_no_inplace: if node.op == gpugemm_no_inplace:
return [gpugemm_inplace(*node.inputs)] return [gpugemm_inplace(*node.inputs)]
@local_optimizer([gpuger_no_inplace], inplace=True)
def local_inplace_gpuager(node):
    """Swap a non-destructive GpuGer for its in-place counterpart."""
    if node.op != gpuger_no_inplace:
        return
    return [gpuger_inplace(*node.inputs)]
gpuablas_opt_inplace = in2out(LocalOptGroup( gpuablas_opt_inplace = in2out(LocalOptGroup(
local_inplace_gpuagemv, local_inplace_gpuagemm), local_inplace_gpuagemv, local_inplace_gpuagemm, local_inplace_gpuager),
name='gpuablas_opt_inplace') name='gpuablas_opt_inplace')
optdb.register('InplaceGpuaBlasOpt', optdb.register('InplaceGpuaBlasOpt',
gpuablas_opt_inplace, gpuablas_opt_inplace,
......
...@@ -17,7 +17,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, ...@@ -17,7 +17,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
GpuAlloc, GpuAlloc,
GpuReshape, GpuReshape,
GpuEye) GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from theano.sandbox.gpuarray.conv import GpuConv from theano.sandbox.gpuarray.conv import GpuConv
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
...@@ -302,23 +302,23 @@ def local_gpua_careduce(node): ...@@ -302,23 +302,23 @@ def local_gpua_careduce(node):
@register_opt() @register_opt()
@op_lifter([tensor.blas.Gemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node): def local_gpua_gemv(node):
return GpuGemv(inplace=node.op.inplace) return GpuGemv(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas_c.CGemv])
def local_gpua_gemv2(node):
return GpuGemv(inplace=node.op.inplace)
@register_opt() @register_opt()
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node): def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node):
    # Lift any CPU Ger variant to the GPU op, preserving destructiveness.
    return GpuGer(destructive=node.op.destructive)
@register_opt() @register_opt()
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node): def local_gpua_dot22(node):
......
from unittest import TestCase from unittest import TestCase
from nose.plugins.skip import SkipTest
import theano import theano
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22 from theano import tensor
from theano.tests import unittest_tools
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22)
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand from theano.sandbox.gpuarray import gpuarray_shared_constructor
from theano.sandbox.gpuarray.tests.test_basic_ops import (makeTester, rand,
mode_with_gpu)
from theano.sandbox.gpuarray.blas import (gpugemv_inplace, from theano.sandbox.gpuarray.blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpu_dot22) gpugemm_inplace, gpugemm_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22)
GpuGemvTester = makeTester('GpuGemvTester', GpuGemvTester = makeTester('GpuGemvTester',
...@@ -21,6 +30,21 @@ GpuGemvTester = makeTester('GpuGemvTester', ...@@ -21,6 +30,21 @@ GpuGemvTester = makeTester('GpuGemvTester',
) )
) )
class TestGpuSgemv(TestCase, BaseGemv, unittest_tools.TestOptimizationMixin):
    """Run the generic Gemv test battery against the gpuarray ops."""
    # Fixture attributes consumed by BaseGemv.
    mode = mode_with_gpu
    dtype = 'float32'
    gemv = gpugemv_no_inplace
    gemv_inplace = gpugemv_inplace

    @staticmethod
    def shared(val):
        # Prefer a GPU-backed shared variable; fall back to the default
        # constructor for values the gpuarray backend rejects.
        try:
            return gpuarray_shared_constructor(val)
        except TypeError:
            return theano.shared(val)
GpuGemmTester = makeTester('GpuGemmTester', GpuGemmTester = makeTester('GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace, op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict( cases=dict(
...@@ -37,9 +61,40 @@ GpuGemmTester = makeTester('GpuGemmTester', ...@@ -37,9 +61,40 @@ GpuGemmTester = makeTester('GpuGemmTester',
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1], # test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1], # test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1], # test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
) )
) )
class TestGpuSger(TestGer):
    """Drive the CPU Ger test battery with the gpuarray implementations."""

    def setUp(self):
        self.mode = mode_with_gpu
        self.dtype = 'float32'  # optimization isn't dtype-dependent
        dt = self.dtype
        self.A = tensor.tensor(dtype=dt, broadcastable=(False, False))
        self.a = tensor.tensor(dtype=dt, broadcastable=())
        self.x = tensor.tensor(dtype=dt, broadcastable=(False,))
        self.y = tensor.tensor(dtype=dt, broadcastable=(False,))
        # data on the gpu make the op always inplace
        self.ger = gpuger_inplace
        self.ger_destructive = gpuger_inplace
        self.gemm = gpugemm_inplace

    def test_f32_0_0(self):
        raise SkipTest('0-sized objects not supported')

    def test_f32_1_0(self):
        raise SkipTest('0-sized objects not supported')

    def test_f32_0_1(self):
        raise SkipTest('0-sized objects not supported')
# Same battery as TestGpuSger, but shared variables are built directly on
# the GPU so no host<->device transfer op appears in the tested graphs.
class TestGpuSgerNoTransfer(TestGpuSger):
    shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
    """Check both GpuGer variants against the generic Op contract."""

    def setUp(self):
        # Instances the mixin exercises for __eq__/__hash__ consistency.
        self.ops = [gpuger_no_inplace, gpuger_inplace]

    def clone(self, op):
        # Fresh instance with the same configuration, for equality checks.
        return GpuGer(destructive=op.destructive)
GpuDot22Tester = makeTester( GpuDot22Tester = makeTester(
'GpuGemmTester', 'GpuGemmTester',
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论