Commit 7cc1d895 authored by Frederic

New optimization that will use cudaMemset for GpuAlloc when we init memory to 0.

上级 c8934e50
......@@ -2255,20 +2255,23 @@ gpu_join = GpuJoin()
class GpuAlloc(GpuOp):
"""Implement Alloc on the gpu.
The memset_0 param is an optimization: when True, we call
cudaMemset, which is faster.
"""
Implement Alloc on the gpu.
"""
def __init__(self):
pass
def __init__(self, memset_0=False):
self.memset_0 = memset_0
def __eq__(self, other):
return type(self) == type(other)
return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
return hash(type(self))
return hash(type(self)) ^ hash(self.memset_0)
def __str__(self):
return self.__class__.__name__
return "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
def make_node(self, value, *shape):
# if there is an unneeded transfer generated by the next line
......@@ -2307,6 +2310,7 @@ class GpuAlloc(GpuOp):
value = inputs[0]
shps = inputs[1:]
nd = len(shps)
memset_0 = int(self.memset_0)
str = "int dims[%(nd)s];\n" % locals()
for idx, sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
......@@ -2330,7 +2334,21 @@ class GpuAlloc(GpuOp):
%(fail)s;
}
}
if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
if (%(memset_0)s)
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
CudaNdarray_SIZE(%(out)s) * 4))
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d"
" bytes of device memory.",
CudaNdarray_SIZE(%(out)s) * 4);
Py_XDECREF(%(out)s);
%(out)s = NULL;
%(fail)s;
}
}
else if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
{
// exception already set
Py_XDECREF(%(out)s);
......@@ -2348,7 +2366,7 @@ class GpuAlloc(GpuOp):
return [None for i in inputs]
def c_code_cache_version(self):
return (4,)
return (5,)
def do_constant_folding(self, node):
for client in node.outputs[0].clients:
......
......@@ -33,6 +33,7 @@ from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.scan_module import scan_utils, scan_op
from theano.tensor.blas import _is_real_vector, _is_real_matrix
......@@ -1337,6 +1338,18 @@ def local_gpualloc(node):
return [new_out]
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc_memset_0(node):
    """Rewrite a ``GpuAlloc`` that fills with the scalar constant 0 into a
    ``GpuAlloc(memset_0=True)``, which uses the faster cudaMemset path.

    Returns a one-element list with the replacement output, or ``None``
    (implicitly) when the pattern does not apply.
    """
    if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
        inp = node.inputs[0]
        # Check the size *before* comparing to 0: comparing a
        # multi-element array to 0 yields a boolean array, whose truth
        # value inside ``and`` would raise ValueError.
        if (isinstance(inp, CudaNdarrayConstant) and
                inp.data.size == 1 and
                numpy.asarray(inp.data) == 0):
            new_out = GpuAlloc(memset_0=True)(*node.inputs)
            return [new_out]
def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'):
......
......@@ -775,11 +775,11 @@ def test_gpujoin_gpualloc():
assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2
assert sum([node.op == T.join for node in f.maker.fgraph.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
assert sum([isinstance(node.op, B.GpuAlloc)
for node in f_gpu.maker.fgraph.toposort()]) == 2
assert sum([node.op == B.gpu_join
for node in f_gpu.maker.fgraph.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
assert sum([isinstance(node.op, B.GpuAlloc)
for node in f_gpu2.maker.fgraph.toposort()]) == 2
assert sum([node.op == B.gpu_join
for node in f_gpu2.maker.fgraph.toposort()]) == 1
......
......@@ -70,6 +70,17 @@ def test_gpualloc():
assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l )
def test_alloc_memset_0():
    """Allocating from a constant zero should compile to a single
    GpuAlloc node with memset_0 enabled, and the output must be all zeros.
    """
    length = tensor.iscalar()
    zero = numpy.zeros((1,), dtype='float32')
    alloc_out = basic_ops.gpu_alloc(
        cuda.gpu_from_host(tensor.constant(zero)), length)
    fn = theano.function([length], alloc_out, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    assert len(nodes) == 1
    op = nodes[0].op
    assert isinstance(op, basic_ops.GpuAlloc)
    assert op.memset_0
    assert (numpy.asarray(fn(6)) == 0).all()
def test_gpuspecifyshape():
x = cuda.shared_constructor(numpy.ones(3,dtype='float32'), 'x')
m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论