提交 0f54f86e authored 作者: Frederic Bastien's avatar Frederic Bastien

detect at run time the gpu pointer size and int size. Use that to allow fusing…

detect at run time the gpu pointer size and int size. Use that to allow fusing more gpu elemwise operations together
上级 23ce2e0f
...@@ -1831,6 +1831,33 @@ static PyTypeObject CudaNdarrayType = ...@@ -1831,6 +1831,33 @@ static PyTypeObject CudaNdarrayType =
CudaNdarray_new, /* tp_new */ CudaNdarray_new, /* tp_new */
}; };
// Tiny device kernel: write the size in bytes of a device pointer into dst[0].
// Run with a single thread (<<<1,1>>>); the host reads the value back to learn
// the gpu pointer width at run time.
static __global__ void get_gpu_ptr_size(int* dst)
{
    *dst = sizeof(float*);
}
// Return a 3-tuple (gpu pointer size, cpu pointer size, cpu int size), all in
// bytes.  The gpu pointer size is measured by launching a one-thread kernel on
// the device; the other two are host-side sizeof values.
//
// Returns NULL with a Python exception set (MemoryError or RuntimeError) on
// failure.  Callers use this to decide how many elemwise inputs fit in the
// gpu kernel parameter space.
PyObject *
CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
{
    // Scratch int on the device to receive the kernel's answer.
    int *gpu_data = (int*)device_malloc(sizeof(int));
    if(gpu_data == NULL){
        return PyErr_Format(PyExc_MemoryError,
                "CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
    }
    get_gpu_ptr_size<<<1,1>>>(gpu_data);
    // Check the launch with the CUDA runtime, not cublas: the kernel is not a
    // cublas call, so cublasGetError() would not report its failure (and its
    // cublasStatus_t return is a different enum than cudaError_t anyway).
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err){
        device_free(gpu_data);  // don't leak the scratch buffer on error
        return PyErr_Format(PyExc_RuntimeError,
                "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
                cudaGetErrorString(err));
    }
    // Copy the single int back to the host, then release the device scratch.
    int gpu_ptr_size = -1;
    cublasGetVector(1, sizeof(int), gpu_data, 1, &gpu_ptr_size, 1);
    device_free(gpu_data);
    if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
        PyErr_SetString(PyExc_RuntimeError,
                "CudaNdarray_ptr_int_size: error copying data from gpu memory");
        return NULL;
    }
    return Py_BuildValue("iii", gpu_ptr_size, sizeof(float*), sizeof(int));
}
// Initialize the gpu. // Initialize the gpu.
// Takes one optional parameter, the device number. // Takes one optional parameter, the device number.
...@@ -2068,6 +2095,7 @@ static PyMethodDef module_methods[] = { ...@@ -2068,6 +2095,7 @@ static PyMethodDef module_methods[] = {
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."}, {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."}, {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
{"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."}, {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
{"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."}, {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
{NULL, NULL, NULL, NULL} /* Sentinel */ {NULL, NULL, NULL, NULL} /* Sentinel */
......
...@@ -735,24 +735,33 @@ optdb.register('InplaceGpuBlasOpt', ...@@ -735,24 +735,33 @@ optdb.register('InplaceGpuBlasOpt',
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace')
gpu_ptr_size = 8
cpu_ptr_size = 8
int_size = 8
try:
#RETURN (gpu ptr size, cpu ptr size, int sizes)
t = cuda_ndarray.cuda_ndarray.ptr_int_size()
gpu_ptr_size, cpu_ptr_size, int_size = t
except Exceptin, e:
print "OPTIMIZATION WARNING: Got the next error, but we can ignore. This could cause less GpuElemwise fused together."
print e
def max_inputs_to_GpuElemwise(node): def max_inputs_to_GpuElemwise(node):
""" """
return the maximum number of input this Apply node to an GpuElemwise can accept. return the maximum number of input this Apply node to an GpuElemwise can accept.
This is needed as currently their is a limit of 256 bytes of paramter for the gpu function. This is needed as currently their is a limit of 256 bytes of paramter for the gpu function.
This mesure the number of paramter we put in our gpu function and compute the maximum number of inputs that respect the 256 bytes limits. This mesure the number of paramter we put in our gpu function and compute the maximum number of inputs that respect the 256 bytes limits.
""" """
#TODO: detect the size of gpu pointeur and c int.
int_size = 8
ptr_size = 8
argument_limit = 256 # if was 240, with this note: 16 bytes are used for block and thread coords etc. argument_limit = 232 # some bytes are used for block and thread coords etc.
ndim = node.inputs[0].type.ndim
size_param_mandatory = int_size #for numels size_param_mandatory = int_size #for numels
size_param_mandatory += int_size * node.inputs[0].type.ndim # for the shape#node.outputs[0].ndim+1+node.inputs[0].ndim+1 size_param_mandatory += int_size * ndim # for the shape
size_param_mandatory += sum((ptr_size + int_size * i.type.ndim) for i in node.outputs) size_param_mandatory += sum((gpu_ptr_size + int_size * ndim) for i in node.outputs)
nb_bytes_avail = argument_limit-size_param_mandatory nb_bytes_avail = argument_limit - size_param_mandatory
nb_bytes_per_inputs = (node.inputs[0].ndim*int_size)+ptr_size nb_bytes_per_inputs = (ndim*int_size) + gpu_ptr_size
max_nb_inputs = nb_bytes_avail//nb_bytes_per_inputs max_nb_inputs = nb_bytes_avail // nb_bytes_per_inputs
return max_nb_inputs return max_nb_inputs
def split_huge_add_or_mul(node): def split_huge_add_or_mul(node):
......
...@@ -161,8 +161,9 @@ def test_huge_elemwise_fusion(): ...@@ -161,8 +161,9 @@ def test_huge_elemwise_fusion():
in case their is too many inputs and that would make it bust the 256 in case their is too many inputs and that would make it bust the 256
bytes limits. bytes limits.
""" """
shape = (3,4,5,6) shape = (2,3,4,5,6)
vars = [tensor.tanh(tensor.ftensor4()) for x in range(10)] ttype = tensor.tensor(dtype='float32',broadcastable=(False,)*len(shape))
vars = [tensor.tanh(ttype) for x in range(10)]
f = pfunc(vars, [vars[0]-vars[1]-vars[2]-vars[3]-vars[4]-vars[5]-vars[6]], mode=mode_with_gpu) f = pfunc(vars, [vars[0]-vars[1]-vars[2]-vars[3]-vars[4]-vars[5]-vars[6]], mode=mode_with_gpu)
topo = f.maker.env.toposort() topo = f.maker.env.toposort()
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
...@@ -170,7 +171,7 @@ def test_huge_elemwise_fusion(): ...@@ -170,7 +171,7 @@ def test_huge_elemwise_fusion():
# print >> sys.stdout, i, node # print >> sys.stdout, i, node
assert len(topo)==10 assert len(topo)==10
assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo])==2 assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo])==2
assert isinstance(topo[7].op.scalar_op,theano.scalar.basic.Composite) assert isinstance(topo[7].op.scalar_op,theano.scalar.basic.Sub)
assert isinstance(topo[8].op.scalar_op,theano.scalar.basic.Composite) assert isinstance(topo[8].op.scalar_op,theano.scalar.basic.Composite)
#let debugmode catch errors #let debugmode catch errors
gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32') gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32')
......
...@@ -876,8 +876,7 @@ class test_fusion(unittest.TestCase): ...@@ -876,8 +876,7 @@ class test_fusion(unittest.TestCase):
self.do(mode, cuda.float32_shared_constructor, shp, gpu=True) self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
def test_gpu_fusion_3d(self): def test_gpu_fusion_Xd(self):
shp=(5,5,5)
#we need the optimisation enabled, debug do this. #we need the optimisation enabled, debug do this.
if theano.config.mode == "FAST_COMPILE": if theano.config.mode == "FAST_COMPILE":
mode = theano.compile.mode.get_mode("FAST_RUN").including('local_elemwise_fusion','canonicalize','gpu') mode = theano.compile.mode.get_mode("FAST_RUN").including('local_elemwise_fusion','canonicalize','gpu')
...@@ -886,7 +885,10 @@ class test_fusion(unittest.TestCase): ...@@ -886,7 +885,10 @@ class test_fusion(unittest.TestCase):
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
if not cuda.cuda_available: if not cuda.cuda_available:
raise SkipTest("cuda not available") raise SkipTest("cuda not available")
if cuda.opt.int_size == 4:
shp=(5,5,5,5)
else:
shp=(5,5,5)
self.do(mode, cuda.float32_shared_constructor, shp, gpu=True) self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
def speed_fusion(self, shared_fn = shared, gpu = False, s=None): def speed_fusion(self, shared_fn = shared, gpu = False, s=None):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论