Commit 0b70cc9e, authored by Frederic

Implement GpuAlloc(memset_0=True) as in the old back-end.

上级 26941db0
...@@ -447,8 +447,26 @@ cuda_from_gpu = CudaFromGpu() ...@@ -447,8 +447,26 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
def __init__(self, memset_0=False):
    """memset_0 is only an optimization hint: when True, the fill
    value is known to always be 0, so the generated C code calls
    memset, which is faster than the generic element copy.
    """
    self.memset_0 = memset_0
def __eq__(self, other):
    """Two ops are equal when they are the exact same class and carry
    the same memset_0 flag."""
    if type(self) != type(other):
        return False
    return self.memset_0 == other.memset_0
def __hash__(self):
    """Hash consistent with __eq__: mix the class identity with the
    memset_0 flag."""
    type_part = hash(type(self))
    flag_part = hash(self.memset_0)
    return type_part ^ flag_part
def __str__(self): def __str__(self):
return 'GpuAlloc' #Hide the memset parameter when not used to prevent confusion.
if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
else:
s = self.__class__.__name__
return s
def make_node(self, value, *shape): def make_node(self, value, *shape):
res = Alloc.make_node(self, value, *shape) res = Alloc.make_node(self, value, *shape)
...@@ -457,6 +475,9 @@ class GpuAlloc(HideC, Alloc): ...@@ -457,6 +475,9 @@ class GpuAlloc(HideC, Alloc):
broadcastable=res.outputs[0].broadcastable) broadcastable=res.outputs[0].broadcastable)
return Apply(self, [value] + res.inputs[1:], [otype()]) return Apply(self, [value] + res.inputs[1:], [otype()])
def c_headers(self):
    """Headers required by the C code this op generates."""
    headers = ['<compyte/numpy_compat.h>']
    return headers
def perform(self, node, inputs, outs): def perform(self, node, inputs, outs):
out, = outs out, = outs
v = inputs[0] v = inputs[0]
...@@ -477,6 +498,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -477,6 +498,7 @@ class GpuAlloc(HideC, Alloc):
ndim = len(inp[1:]) ndim = len(inp[1:])
zz, = out zz, = out
memset_0 = int(self.memset_0)
code = """ code = """
int i; int i;
size_t %(name)s_shape[%(ndim)s]; size_t %(name)s_shape[%(ndim)s];
...@@ -503,12 +525,24 @@ class GpuAlloc(HideC, Alloc): ...@@ -503,12 +525,24 @@ class GpuAlloc(HideC, Alloc):
%(fail)s %(fail)s
} }
} }
if (%(memset_0)s && GpuArray_ISONESEGMENT(&%(zz)s->ga))
if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) { {
int err = GpuArray_memset(&%(zz)s->ga, 0);
if (err != GA_NO_ERROR)
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d"
" element of device memory to 0.",
PyGpuArray_SIZE(%(zz)s));
%(fail)s;
}
}
else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_ValueError, "setarray failed"); PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s %(fail)s
} }
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, fail=sub['fail']) """ % dict(name=name, ndim=ndim, zz=zz, vv=vv,
fail=sub['fail'], memset_0=memset_0)
if config.gpuarray.sync: if config.gpuarray.sync:
code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz) code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
......
...@@ -13,12 +13,15 @@ from theano.sandbox.gpuarray.type import GpuArrayType ...@@ -13,12 +13,15 @@ from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_from_host, gpu_from_host,
gpu_alloc, GpuReshape, gpu_alloc,
GpuAlloc,
GpuReshape,
GpuEye) GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -126,6 +129,18 @@ def local_gpualloc(node): ...@@ -126,6 +129,18 @@ def local_gpualloc(node):
return gpu_alloc return gpu_alloc
@register_opt()
@local_optimizer([GpuAlloc])
def local_gpualloc_memset_0(node):
    """Rewrite GpuAlloc of a constant scalar 0 into the faster
    GpuAlloc(memset_0=True) variant.

    Leaves the node alone (returns None) unless the fill value is a
    GpuArrayConstant with a single element equal to 0.
    """
    if not isinstance(node.op, GpuAlloc) or node.op.memset_0:
        return
    fill = node.inputs[0]
    if not isinstance(fill, GpuArrayConstant):
        return
    if fill.data.size != 1:
        return
    if not (numpy.asarray(fill.data) == 0).all():
        return
    return [GpuAlloc(memset_0=True)(*node.inputs)]
@register_opt() @register_opt()
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node): def local_gpureshape(node):
......
...@@ -2,7 +2,8 @@ import numpy ...@@ -2,7 +2,8 @@ import numpy
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.sandbox.gpuarray.basic_ops import GpuReshape from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc
from theano.sandbox.gpuarray.elemwise import GpuCAReduce
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
...@@ -70,3 +71,36 @@ def test_sum_prod(): ...@@ -70,3 +71,36 @@ def test_sum_prod():
assert res.shape == () assert res.shape == ()
assert GpuCAReduce in [type(node.op) assert GpuCAReduce in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
def test_local_gpualloc_memset_0():
    """Check that allocating a constant 0 is rewritten to the memset
    variant of GpuAlloc, while non-zero fills are left untouched."""
    i = theano.tensor.iscalar()

    def build_and_check(fill_value, expect_memset):
        # Compile alloc(fill_value, i), verify the op the optimizer
        # selected, and hand back the compiled function.
        out = gpu_alloc(fill_value, i)
        f = theano.function([i], out, mode=mode_with_gpu)
        nodes = f.maker.fgraph.toposort()
        assert len(nodes) == 1
        assert isinstance(nodes[0].op, GpuAlloc)
        assert bool(nodes[0].op.memset_0) == expect_memset
        return f

    # A constant scalar 0 triggers the memset path.
    f = build_and_check(numpy.zeros((1,), dtype='float32'), True)
    assert (numpy.asarray(f(6)) == 0).all()
    # A constant 1 must not be rewritten.
    f = build_and_check(numpy.ones((1,), dtype='float32'), False)
    assert (numpy.asarray(f(6)) == 1).all()
    # Nor a vector of ones.
    f = build_and_check(numpy.ones((2,), dtype='float32'), False)
    assert (numpy.asarray(f(2)) == 1).all()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论