Commit 18708214 authored by lamblin

Merge pull request #942 from nouiz/gpu_alloc2

Gpu alloc2
......@@ -164,7 +164,7 @@ if compile_cuda_ndarray and cuda_available:
code,
location=cuda_ndarray_loc,
include_dirs=[cuda_path], libs=['cublas'],
preargs=compiler.compile_args())
preargs=['-O3'] + compiler.compile_args())
from cuda_ndarray.cuda_ndarray import *
except Exception, e:
_logger.error("Failed to compile cuda_ndarray.cu: %s", str(e))
......
......@@ -2255,20 +2255,28 @@ gpu_join = GpuJoin()
class GpuAlloc(GpuOp):
"""Implement Alloc on the gpu.
The memset_0 param is an optimization. When True, we call
cudaMalloc that is faster.
"""
Implement Alloc on the gpu.
"""
def __init__(self):
pass
def __init__(self, memset_0=False):
self.memset_0 = memset_0
def __eq__(self, other):
return type(self) == type(other)
return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
return hash(type(self))
return hash(type(self)) ^ hash(self.memset_0)
def __str__(self):
return self.__class__.__name__
#Hide the memset parameter when not used to prevent confusion.
if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
else:
s = self.__class__.__name__
return s
def make_node(self, value, *shape):
#if their is unneeded transfert generated by the next line
......@@ -2307,6 +2315,7 @@ class GpuAlloc(GpuOp):
value = inputs[0]
shps = inputs[1:]
nd = len(shps)
memset_0 = int(self.memset_0)
str = "int dims[%(nd)s];\n" % locals()
for idx, sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
......@@ -2330,7 +2339,21 @@ class GpuAlloc(GpuOp):
%(fail)s;
}
}
if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
if (%(memset_0)s)
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
CudaNdarray_SIZE(%(out)s) * 4))
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d"
" bytes of device memory.",
CudaNdarray_SIZE(%(out)s) * 4);
Py_XDECREF(%(out)s);
%(out)s = NULL;
%(fail)s;
}
}
else if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
{
// exception already set
Py_XDECREF(%(out)s);
......@@ -2348,7 +2371,7 @@ class GpuAlloc(GpuOp):
return [None for i in inputs]
def c_code_cache_version(self):
return (4,)
return (5,)
def do_constant_folding(self, node):
for client in node.outputs[0].clients:
......
......@@ -3133,24 +3133,6 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
return 0;
}
// Return true iff the array's strides describe a dense C-contiguous
// (row-major) layout: the innermost non-broadcast dimension has stride 1
// and each outer stride equals the product of all inner dimensions.
// Dimensions of extent 1 are skipped, since their stride is irrelevant.
bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self)
{
    int expected_stride = 1;
    for (int i = self->nd - 1; i >= 0; --i)
    {
        const int dim_i = CudaNdarray_HOST_DIMS(self)[i];
        if (dim_i == 1)
            continue;  // broadcast dims do not constrain contiguity
        if (CudaNdarray_HOST_STRIDES(self)[i] != expected_stride)
            return false;
        expected_stride *= dim_i;
    }
    return true;
}
PyObject *
CudaNdarray_new_nd(int nd)
{
......@@ -4346,12 +4328,6 @@ CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
return self->host_structure + 2*self->nd;
}
// Flag the device-side copy of the dims/strides structure as stale, so it
// will be re-uploaded before the next kernel that needs it.
void
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
    self->dev_structure_fresh = 0;
}
int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
{
......@@ -4406,43 +4382,33 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
}
void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
int
cnda_copy_structure_to_device(const CudaNdarray * self)
{
if ((idx >= self->nd) || (idx < 0) || (d < 0))
//If the device structure do not exists, create it.
//We allocate it here as we do not need it often.
//In fact, we need it so infrequently that we expect
//that most object won't need it. Not allocating it
//save a significant when creating object.
//This speed up a benchmark by 8% with the gc.
if (!self->dev_structure)
{
int struct_size = cnda_structure_size(self->nd);
if (struct_size)
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
}
if (d != self->host_structure[idx])
self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
if (NULL == self->dev_structure)
{
self->host_structure[idx] = d;
int log2d = (int)log2((double)d);
self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
cnda_mark_dev_structure_dirty(self);
return -1;
}
}
void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
if ((idx >= self->nd) || (idx < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
}
if (s != CudaNdarray_HOST_STRIDES(self)[idx])
{
self->host_structure[idx+self->nd] = s;
cnda_mark_dev_structure_dirty(self);
}
}
int
cnda_copy_structure_to_device(const CudaNdarray * self)
{
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
cublasSetVector(cnda_structure_size(self->nd),
sizeof(int),
self->host_structure,
1,
self->dev_structure,
1);
CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
......@@ -4510,56 +4476,6 @@ CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
return PyInt_FromLong(CudaNdarray_SIZE(self));
}
// Resize the array's metadata to hold `nd` dimensions.
//
// Frees and re-allocates both the host-side and the device-side
// dims/strides structures when `nd` changes.  Passing nd == -1 releases
// the structures without allocating new ones.  Returns 0 on success and
// -1 on failure (a Python MemoryError is set only for the host
// allocation failure; device_malloc/device_free are presumed to set
// their own error — TODO confirm).
//
// Note: this does NOT allocate or touch the data buffer itself.
int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
    if (nd != self->nd)
    {
        // Release the stale device-side structure first.
        if (self->dev_structure)
        {
            if (device_free(self->dev_structure))
            {
                return -1;
            }
            self->dev_structure = NULL;
        }
        if (self->host_structure)
        {
            free(self->host_structure);
            self->host_structure = NULL;
            // Mark the object as dimensionless until re-allocation succeeds.
            self->nd = -1;
        }
        if (nd == -1) return 0;
        self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
        if (NULL == self->host_structure)
        {
            PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
            return -1;
        }
        //initialize all dimensions and strides to 0
        for (int i = 0; i < cnda_structure_size(nd); ++i)
        {
            self->host_structure[i] = 0;
        }
        int struct_size = cnda_structure_size(nd);
        if (struct_size)
        {
            // Mirror the structure on the device so kernels can read it.
            self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
            if (NULL == self->dev_structure)
            {
                // Roll back the host allocation so the object stays consistent.
                free(self->host_structure);
                self->host_structure = NULL;
                self->dev_structure = NULL;
                return -1;
            }
        }
        self->nd = nd;
        // The freshly-zeroed structure has not been copied to the device yet.
        self->dev_structure_fresh = 0;
    }
    return 0;
}
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
......
......@@ -82,8 +82,9 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc)
mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
//dev_structure should be accessed via macros, otherwise may not be
//synchronized. The macro will allocate it when needed.
mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0].
};
......@@ -126,8 +127,12 @@ CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
DllExport const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self);
DllExport void
cnda_mark_dev_structure_dirty(CudaNdarray * self);
// Flag the device-side copy of the dims/strides structure as stale, so it
// will be re-synchronized before the next use.  Inlined (and forced so
// with always_inline) because it is called on every shape/stride change.
DllExport inline void __attribute__((always_inline))
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
    self->dev_structure_fresh = 0;
}
DllExport int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase);
......@@ -143,11 +148,38 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
*
* Does not sync structure to host.
*/
DllExport void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d);
// Set dimension `idx` of `self` to extent `d`, updating the cached
// log2-dimension slot (used for power-of-two fast paths) and marking the
// device structure dirty.  Does not sync the structure to the device.
//
// NOTE(review): an out-of-range idx or negative d only prints a warning
// and then falls through to the writes below — verify callers never rely
// on the warning to prevent the store.
DllExport inline void __attribute__((always_inline))
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
    if ((idx >= self->nd) || (idx < 0) || (d < 0))
    {
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
    }
    if (d != self->host_structure[idx])
    {
        self->host_structure[idx] = d;
        // Cache log2(d) when d is a power of two, -1 otherwise.
        // NOTE(review): log2(0) is -inf; for d == 0 the cast to int is
        // implementation-defined — presumably the -1 branch is taken,
        // confirm on the target platform.
        int log2d = (int)log2((double)d);
        self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
        cnda_mark_dev_structure_dirty(self);
    }
}
DllExport void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
// Set the stride of dimension `idx` of `self` to `s` (in elements, not
// bytes — TODO confirm against CudaNdarray_HOST_STRIDES users), marking
// the device structure dirty when the value actually changes.
// Does not sync the structure to the device.
//
// NOTE(review): an out-of-range idx only prints a warning and still
// performs the store below.
DllExport inline void __attribute__((always_inline))
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
    if ((idx >= self->nd) || (idx < 0))
    {
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
    }
    if (s != CudaNdarray_HOST_STRIDES(self)[idx])
    {
        // Strides live after the nd dims in the flat host_structure array.
        self->host_structure[idx+self->nd] = s;
        cnda_mark_dev_structure_dirty(self);
    }
}
/***
* Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
......@@ -188,7 +220,46 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd);
*
* Note: This does not allocate storage for data.
*/
DllExport int CudaNdarray_set_nd(CudaNdarray * self, const int nd);
// Resize the array's metadata to hold `nd` dimensions.
//
// Frees the old host- and device-side dims/strides structures when `nd`
// changes, then allocates and zero-fills a new host structure.  Unlike
// the previous implementation, the device-side structure is now created
// lazily in cnda_copy_structure_to_device, which speeds up object
// creation for the common case where it is never needed.
// Passing nd == -1 releases the structures without allocating new ones.
// Returns 0 on success, -1 on failure (MemoryError set for the host
// allocation failure).
//
// Note: this does NOT allocate or touch the data buffer itself.
DllExport inline int __attribute__((always_inline))
CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
    if (nd != self->nd)
    {
        if (self->dev_structure)
        {
            if (device_free(self->dev_structure))
            {
                return -1;
            }
            self->dev_structure = NULL;
        }
        if (self->host_structure)
        {
            free(self->host_structure);
            self->host_structure = NULL;
            // Mark the object as dimensionless until re-allocation succeeds.
            self->nd = -1;
        }
        if (nd == -1) return 0;
        self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
        if (NULL == self->host_structure)
        {
            PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
            return -1;
        }
        //initialize all dimensions and strides to 0
        for (int i = 0; i < cnda_structure_size(nd); ++i)
        {
            self->host_structure[i] = 0;
        }
        //The device structure will be created in cnda_copy_structure_to_device
        //if needed.
        self->nd = nd;
        self->dev_structure_fresh = 0;
    }
    return 0;
}
/**
* CudaNdarray_alloc_contiguous
......@@ -218,7 +289,7 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
size = size * dim[i];
}
if (CudaNdarray_is_c_contiguous(self) && (self->data_allocated == size))
if ((self->data_allocated == size) && CudaNdarray_is_c_contiguous(self))
{
return 0;
}
......@@ -333,7 +404,24 @@ CudaNdarray_ZEROS(int n, int * dims);
/**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/
DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
// Return true iff the strides describe a dense C-contiguous (row-major)
// layout: stride 1 on the innermost non-broadcast dimension and each
// outer stride equal to the product of the inner extents.  Dimensions of
// extent 1 are skipped since their stride does not matter.
DllExport inline bool __attribute__((always_inline))
CudaNdarray_is_c_contiguous(const CudaNdarray * self)
{
    bool c_contiguous = true;
    int size = 1;  // expected stride for the current dimension
    for (int i = self->nd-1; (i >= 0) && c_contiguous; --i)
    {
        if (CudaNdarray_HOST_DIMS(self)[i] == 1)
            continue;
        if (CudaNdarray_HOST_STRIDES(self)[i] != size)
        {
            c_contiguous = false;
        }
        size = size * CudaNdarray_HOST_DIMS(self)[i];
    }
    return c_contiguous;
}
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
......
......@@ -33,6 +33,7 @@ from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.scan_module import scan_utils, scan_op
from theano.tensor.blas import _is_real_vector, _is_real_matrix
......@@ -1337,6 +1338,19 @@ def local_gpualloc(node):
return [new_out]
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc_memset_0(node):
    """Swap a GpuAlloc filled with the constant scalar 0 for its
    memset_0 variant.

    When the fill value is a size-1 CudaNdarray constant equal to 0,
    the allocation can be zeroed with cudaMemset, which is faster than
    copying the value element-wise.

    Returns the replacement output list, or None (implicitly) when the
    node does not match.
    """
    # Fixed: the original assigned an unused local `replace = False`.
    if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
        inp = node.inputs[0]
        if (isinstance(inp, CudaNdarrayConstant) and
                inp.data.size == 1 and
                (numpy.asarray(inp.data) == 0).all()):
            # Rebuild the same Alloc with the memset fast path enabled.
            return [GpuAlloc(memset_0=True)(*node.inputs)]
def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'):
......
......@@ -775,11 +775,11 @@ def test_gpujoin_gpualloc():
assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2
assert sum([node.op == T.join for node in f.maker.fgraph.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
assert sum([isinstance(node.op, B.GpuAlloc)
for node in f_gpu.maker.fgraph.toposort()]) == 2
assert sum([node.op == B.gpu_join
for node in f_gpu.maker.fgraph.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
assert sum([isinstance(node.op, B.GpuAlloc)
for node in f_gpu2.maker.fgraph.toposort()]) == 2
assert sum([node.op == B.gpu_join
for node in f_gpu2.maker.fgraph.toposort()]) == 1
......
......@@ -62,7 +62,7 @@ def test_memory():
# When dtype is float64, only the shared is on the gpu and it is transferd
# to the cpu for computation. So no extra alloc after compilation.
# more_alloc1 if after the first compilation, more_alloc2 after the second.
for dtype, more_alloc1, more_alloc2 in [("float32", 2, 9),
for dtype, more_alloc1, more_alloc2 in [("float32", 1, 4),
("float64", 0, 0)]:
print dtype
test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)
......@@ -129,7 +129,7 @@ def test_memory_lazy():
# When dtype is float64, only the shared is on the gpu and it is transferd
# to the cpu for computation. So no extra alloc after compilation.
# more_alloc1 if after the first compilation, more_alloc2 after the second.
for dtype, more_alloc1 in [("float32", 3),
for dtype, more_alloc1 in [("float32", 2),
("float64", 0)]:
print dtype
test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)
......
......@@ -70,6 +70,39 @@ def test_gpualloc():
# Fixed two bugs: `ininstance` was a typo for `isinstance`, and
# numpy.any(<generator>) treats the generator object as a truthy scalar,
# so the original assertion could never fail.  The built-in any()
# consumes the generator correctly.
assert any(isinstance(x.op, cuda.GpuAlloc) for x in l)
def test_alloc_memset_0():
    """GpuAlloc must take the cudaMemset fast path only when filling
    with a constant scalar 0; any other fill value keeps the generic
    copy path."""
    shp = tensor.iscalar()
    zero_val = numpy.zeros((1,), dtype='float32')
    one_val = numpy.ones((1,), dtype='float32')
    two_ones = numpy.ones((2,), dtype='float32')

    # Constant scalar 0: the optimizer should enable memset_0.
    out = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(zero_val)), shp)
    fn = theano.function([shp], out, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    assert len(nodes) == 1
    assert isinstance(nodes[0].op, basic_ops.GpuAlloc) and nodes[0].op.memset_0
    assert (numpy.asarray(fn(6)) == 0).all()

    # Constant scalar 1: must NOT use the memset path.
    out = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(one_val)), shp)
    fn = theano.function([shp], out, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    assert len(nodes) == 1
    assert isinstance(nodes[0].op, basic_ops.GpuAlloc)
    assert not nodes[0].op.memset_0
    assert (numpy.asarray(fn(6)) == 1).all()

    # Non-scalar constant of ones: also must avoid the memset path.
    out = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(two_ones)), shp)
    fn = theano.function([shp], out, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    assert len(nodes) == 1
    assert isinstance(nodes[0].op, basic_ops.GpuAlloc)
    assert not nodes[0].op.memset_0
    assert (numpy.asarray(fn(2)) == 1).all()
def test_gpuspecifyshape():
x = cuda.shared_constructor(numpy.ones(3,dtype='float32'), 'x')
m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
......
......@@ -62,8 +62,8 @@ def test_merge_with_weird_eq():
g = Env([x, y], [x+y])
MergeOptimizer().optimize(g)
assert len(g.nodes) == 1
node = list(g.nodes)[0]
assert len(g.apply_nodes) == 1
node = list(g.apply_nodes)[0]
assert len(node.inputs) == 2
assert node.inputs[0] is node.inputs[1]
......@@ -74,8 +74,7 @@ def test_merge_with_weird_eq():
g = Env([x, y], [x+y])
MergeOptimizer().optimize(g)
assert len(g.nodes) == 1
node = list(g.nodes)[0]
assert len(g.apply_nodes) == 1
node = list(g.apply_nodes)[0]
assert len(node.inputs) == 2
assert node.inputs[0] is node.inputs[1]
Markdown formatting is supported
0%
You are about to add 0 people to this discussion. Proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment