提交 7cc1d895 authored 作者: Frederic's avatar Frederic

New optimization that will use cudaMemset for GpuAlloc when we init memory to 0.

上级 c8934e50
...@@ -2255,20 +2255,23 @@ gpu_join = GpuJoin() ...@@ -2255,20 +2255,23 @@ gpu_join = GpuJoin()
class GpuAlloc(GpuOp): class GpuAlloc(GpuOp):
"""Implement Alloc on the gpu.
The memset_0 param is an optimization. When True, we call
cudaMemset, which is faster.
""" """
Implement Alloc on the gpu. def __init__(self, memset_0=False):
""" self.memset_0 = memset_0
def __init__(self):
pass
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self)) ^ hash(self.memset_0)
def __str__(self): def __str__(self):
return self.__class__.__name__ return "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
def make_node(self, value, *shape): def make_node(self, value, *shape):
# if there is an unneeded transfer generated by the next line # if there is an unneeded transfer generated by the next line
...@@ -2307,6 +2310,7 @@ class GpuAlloc(GpuOp): ...@@ -2307,6 +2310,7 @@ class GpuAlloc(GpuOp):
value = inputs[0] value = inputs[0]
shps = inputs[1:] shps = inputs[1:]
nd = len(shps) nd = len(shps)
memset_0 = int(self.memset_0)
str = "int dims[%(nd)s];\n" % locals() str = "int dims[%(nd)s];\n" % locals()
for idx, sh in enumerate(shps): for idx, sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals() str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
...@@ -2330,7 +2334,21 @@ class GpuAlloc(GpuOp): ...@@ -2330,7 +2334,21 @@ class GpuAlloc(GpuOp):
%(fail)s; %(fail)s;
} }
} }
if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true)) if (%(memset_0)s)
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
CudaNdarray_SIZE(%(out)s) * 4))
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d"
" bytes of device memory.",
CudaNdarray_SIZE(%(out)s) * 4);
Py_XDECREF(%(out)s);
%(out)s = NULL;
%(fail)s;
}
}
else if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
{ {
// exception already set // exception already set
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
...@@ -2348,7 +2366,7 @@ class GpuAlloc(GpuOp): ...@@ -2348,7 +2366,7 @@ class GpuAlloc(GpuOp):
return [None for i in inputs] return [None for i in inputs]
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
for client in node.outputs[0].clients: for client in node.outputs[0].clients:
......
...@@ -33,6 +33,7 @@ from theano.sandbox.cuda.nnet import ( ...@@ -33,6 +33,7 @@ from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias) GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.scan_module import scan_utils, scan_op from theano.scan_module import scan_utils, scan_op
from theano.tensor.blas import _is_real_vector, _is_real_matrix from theano.tensor.blas import _is_real_vector, _is_real_matrix
...@@ -1337,6 +1338,18 @@ def local_gpualloc(node): ...@@ -1337,6 +1338,18 @@ def local_gpualloc(node):
return [new_out] return [new_out]
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc_memset_0(node):
    """Switch a zero-filling GpuAlloc to its ``memset_0=True`` variant.

    The rewrite applies when ``node.op`` is a ``GpuAlloc`` that does not
    already have ``memset_0`` set and whose first input (the fill value) is
    a ``CudaNdarrayConstant`` holding exactly one element equal to 0.

    :param node: the apply node being examined by the optimizer.
    :return: a one-element list with the replacement output, or ``None``
        when the node does not match.
    """
    # NOTE(review): the decorator lists tensor.Alloc while the body matches
    # GpuAlloc nodes -- confirm the tracked op is the intended one.
    if not isinstance(node.op, GpuAlloc) or node.op.memset_0:
        return None

    inp = node.inputs[0]
    if not isinstance(inp, CudaNdarrayConstant):
        return None

    # Test the size first: comparing a multi-element array against 0 gives
    # an array whose truth value is ambiguous, and evaluating it in a
    # boolean context raises ValueError instead of declining the rewrite.
    if inp.data.size != 1:
        return None
    if not (numpy.asarray(inp.data) == 0).all():
        return None

    return [GpuAlloc(memset_0=True)(*node.inputs)]
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
......
...@@ -775,11 +775,11 @@ def test_gpujoin_gpualloc(): ...@@ -775,11 +775,11 @@ def test_gpujoin_gpualloc():
assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2 assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2
assert sum([node.op == T.join for node in f.maker.fgraph.toposort()]) == 1 assert sum([node.op == T.join for node in f.maker.fgraph.toposort()]) == 1
assert sum([node.op == B.gpu_alloc assert sum([isinstance(node.op, B.GpuAlloc)
for node in f_gpu.maker.fgraph.toposort()]) == 2 for node in f_gpu.maker.fgraph.toposort()]) == 2
assert sum([node.op == B.gpu_join assert sum([node.op == B.gpu_join
for node in f_gpu.maker.fgraph.toposort()]) == 1 for node in f_gpu.maker.fgraph.toposort()]) == 1
assert sum([node.op == B.gpu_alloc assert sum([isinstance(node.op, B.GpuAlloc)
for node in f_gpu2.maker.fgraph.toposort()]) == 2 for node in f_gpu2.maker.fgraph.toposort()]) == 2
assert sum([node.op == B.gpu_join assert sum([node.op == B.gpu_join
for node in f_gpu2.maker.fgraph.toposort()]) == 1 for node in f_gpu2.maker.fgraph.toposort()]) == 1
......
...@@ -70,6 +70,17 @@ def test_gpualloc(): ...@@ -70,6 +70,17 @@ def test_gpualloc():
assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l ) assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l )
def test_alloc_memset_0():
    """Check that allocating from a constant zero yields GpuAlloc(memset_0=True)."""
    i = tensor.iscalar()
    z = numpy.zeros((1,), dtype='float32')
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(z)), i)
    f = theano.function([i], a, mode=mode_with_gpu)

    # The compiled graph must be a single GpuAlloc node with memset_0 set.
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc) and topo[0].op.memset_0

    out = numpy.asarray(f(6))
    # Check the shape as well: `.all()` is vacuously True on an empty
    # array, so the value check alone would not catch an empty output.
    assert out.shape == (6,)
    assert (out == 0).all()
def test_gpuspecifyshape(): def test_gpuspecifyshape():
x = cuda.shared_constructor(numpy.ones(3,dtype='float32'), 'x') x = cuda.shared_constructor(numpy.ones(3,dtype='float32'), 'x')
m = theano.tensor.specify_shape(x + numpy.float32(1), (3,)) m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论