提交 0f54f86e authored 作者: Frederic Bastien's avatar Frederic Bastien

detect at run time the gpu pointer size and int size. Use that to allow fusing…

detect at run time the gpu pointer size and int size. Use that to allow fusing more gpu elemwise operations together
上级 23ce2e0f
...@@ -1831,6 +1831,33 @@ static PyTypeObject CudaNdarrayType = ...@@ -1831,6 +1831,33 @@ static PyTypeObject CudaNdarrayType =
CudaNdarray_new, /* tp_new */ CudaNdarray_new, /* tp_new */
}; };
// Tiny device kernel: write the size in bytes of a device pointer into dst[0].
// Run with a single thread (<<<1,1>>>); the host reads the value back to learn
// the gpu pointer width at run time.
static __global__ void get_gpu_ptr_size(int* dst)
{
    *dst = sizeof(float*);
}
// Return a 3-tuple (gpu pointer size, cpu pointer size, cpu int size), all in
// bytes.  The gpu pointer size is measured by launching a one-thread kernel on
// the device; the other two are host-side sizeof values.
//
// Returns NULL with a Python exception set (MemoryError or RuntimeError) on
// failure.  Callers use this to decide how many elemwise inputs fit in the
// gpu kernel parameter space.
PyObject *
CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
{
    // Scratch int on the device to receive the kernel's answer.
    int *gpu_data = (int*)device_malloc(sizeof(int));
    if(gpu_data == NULL){
        return PyErr_Format(PyExc_MemoryError,
                "CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
    }
    get_gpu_ptr_size<<<1,1>>>(gpu_data);
    // Check the launch with the CUDA runtime, not cublas: the kernel is not a
    // cublas call, so cublasGetError() would not report its failure (and its
    // cublasStatus_t return is a different enum than cudaError_t anyway).
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err){
        device_free(gpu_data);  // don't leak the scratch buffer on error
        return PyErr_Format(PyExc_RuntimeError,
                "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
                cudaGetErrorString(err));
    }
    // Copy the single int back to the host, then release the device scratch.
    int gpu_ptr_size = -1;
    cublasGetVector(1, sizeof(int), gpu_data, 1, &gpu_ptr_size, 1);
    device_free(gpu_data);
    if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
        PyErr_SetString(PyExc_RuntimeError,
                "CudaNdarray_ptr_int_size: error copying data from gpu memory");
        return NULL;
    }
    return Py_BuildValue("iii", gpu_ptr_size, sizeof(float*), sizeof(int));
}
// Initialize the gpu. // Initialize the gpu.
// Takes one optional parameter, the device number. // Takes one optional parameter, the device number.
...@@ -2068,6 +2095,7 @@ static PyMethodDef module_methods[] = { ...@@ -2068,6 +2095,7 @@ static PyMethodDef module_methods[] = {
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."}, {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."}, {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
{"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."}, {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
{"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."}, {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
{NULL, NULL, NULL, NULL} /* Sentinel */ {NULL, NULL, NULL, NULL} /* Sentinel */
......
...@@ -735,24 +735,33 @@ optdb.register('InplaceGpuBlasOpt', ...@@ -735,24 +735,33 @@ optdb.register('InplaceGpuBlasOpt',
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace')
gpu_ptr_size = 8
cpu_ptr_size = 8
int_size = 8
try:
#RETURN (gpu ptr size, cpu ptr size, int sizes)
t = cuda_ndarray.cuda_ndarray.ptr_int_size()
gpu_ptr_size, cpu_ptr_size, int_size = t
except Exceptin, e:
print "OPTIMIZATION WARNING: Got the next error, but we can ignore. This could cause less GpuElemwise fused together."
print e
def max_inputs_to_GpuElemwise(node): def max_inputs_to_GpuElemwise(node):
""" """
return the maximum number of input this Apply node to an GpuElemwise can accept. return the maximum number of input this Apply node to an GpuElemwise can accept.
This is needed as currently their is a limit of 256 bytes of paramter for the gpu function. This is needed as currently their is a limit of 256 bytes of paramter for the gpu function.
This mesure the number of paramter we put in our gpu function and compute the maximum number of inputs that respect the 256 bytes limits. This mesure the number of paramter we put in our gpu function and compute the maximum number of inputs that respect the 256 bytes limits.
""" """
#TODO: detect the size of gpu pointeur and c int.
int_size = 8
ptr_size = 8
argument_limit = 256 # if was 240, with this note: 16 bytes are used for block and thread coords etc. argument_limit = 232 # some bytes are used for block and thread coords etc.
ndim = node.inputs[0].type.ndim
size_param_mandatory = int_size #for numels size_param_mandatory = int_size #for numels
size_param_mandatory += int_size * node.inputs[0].type.ndim # for the shape#node.outputs[0].ndim+1+node.inputs[0].ndim+1 size_param_mandatory += int_size * ndim # for the shape
size_param_mandatory += sum((ptr_size + int_size * i.type.ndim) for i in node.outputs) size_param_mandatory += sum((gpu_ptr_size + int_size * ndim) for i in node.outputs)
nb_bytes_avail = argument_limit-size_param_mandatory nb_bytes_avail = argument_limit - size_param_mandatory
nb_bytes_per_inputs = (node.inputs[0].ndim*int_size)+ptr_size nb_bytes_per_inputs = (ndim*int_size) + gpu_ptr_size
max_nb_inputs = nb_bytes_avail//nb_bytes_per_inputs max_nb_inputs = nb_bytes_avail // nb_bytes_per_inputs
return max_nb_inputs return max_nb_inputs
def split_huge_add_or_mul(node): def split_huge_add_or_mul(node):
......
...@@ -161,8 +161,9 @@ def test_huge_elemwise_fusion(): ...@@ -161,8 +161,9 @@ def test_huge_elemwise_fusion():
in case their is too many inputs and that would make it bust the 256 in case their is too many inputs and that would make it bust the 256
bytes limits. bytes limits.
""" """
shape = (3,4,5,6) shape = (2,3,4,5,6)
vars = [tensor.tanh(tensor.ftensor4()) for x in range(10)] ttype = tensor.tensor(dtype='float32',broadcastable=(False,)*len(shape))
vars = [tensor.tanh(ttype) for x in range(10)]
f = pfunc(vars, [vars[0]-vars[1]-vars[2]-vars[3]-vars[4]-vars[5]-vars[6]], mode=mode_with_gpu) f = pfunc(vars, [vars[0]-vars[1]-vars[2]-vars[3]-vars[4]-vars[5]-vars[6]], mode=mode_with_gpu)
topo = f.maker.env.toposort() topo = f.maker.env.toposort()
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
...@@ -170,7 +171,7 @@ def test_huge_elemwise_fusion(): ...@@ -170,7 +171,7 @@ def test_huge_elemwise_fusion():
# print >> sys.stdout, i, node # print >> sys.stdout, i, node
assert len(topo)==10 assert len(topo)==10
assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo])==2 assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo])==2
assert isinstance(topo[7].op.scalar_op,theano.scalar.basic.Composite) assert isinstance(topo[7].op.scalar_op,theano.scalar.basic.Sub)
assert isinstance(topo[8].op.scalar_op,theano.scalar.basic.Composite) assert isinstance(topo[8].op.scalar_op,theano.scalar.basic.Composite)
#let debugmode catch errors #let debugmode catch errors
gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32') gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32')
......
...@@ -876,8 +876,7 @@ class test_fusion(unittest.TestCase): ...@@ -876,8 +876,7 @@ class test_fusion(unittest.TestCase):
self.do(mode, cuda.float32_shared_constructor, shp, gpu=True) self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
def test_gpu_fusion_3d(self): def test_gpu_fusion_Xd(self):
shp=(5,5,5)
#we need the optimisation enabled, debug do this. #we need the optimisation enabled, debug do this.
if theano.config.mode == "FAST_COMPILE": if theano.config.mode == "FAST_COMPILE":
mode = theano.compile.mode.get_mode("FAST_RUN").including('local_elemwise_fusion','canonicalize','gpu') mode = theano.compile.mode.get_mode("FAST_RUN").including('local_elemwise_fusion','canonicalize','gpu')
...@@ -886,7 +885,10 @@ class test_fusion(unittest.TestCase): ...@@ -886,7 +885,10 @@ class test_fusion(unittest.TestCase):
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
if not cuda.cuda_available: if not cuda.cuda_available:
raise SkipTest("cuda not available") raise SkipTest("cuda not available")
if cuda.opt.int_size == 4:
shp=(5,5,5,5)
else:
shp=(5,5,5)
self.do(mode, cuda.float32_shared_constructor, shp, gpu=True) self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
def speed_fusion(self, shared_fn = shared, gpu = False, s=None): def speed_fusion(self, shared_fn = shared, gpu = False, s=None):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论