Merge pull request #376 from nouiz/gpu_properties

Gpu properties

Merge pull request #376 from nouiz/gpu_properties
a6533e40 · David Warde-Farley · 888485ef · 1c9cb0a1 · a6533e40
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -599,7 +599,7 @@ __global__ void k_copy_reshape_rowmajor(unsigned int numEls,
 // Reshape self to the new shape gived by the tuple shape.
 //
 // If self is c contiguous, it return a view. Otherwise it always do a copy.
-// TODO: make it return a view when the strides allow it event if it is not 
+// TODO: make it return a view when the strides allow it even if it is not
 //       c contiguous
 PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
 {
@@ -1133,7 +1133,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
                        CudaNdarray_HOST_STRIDES(self)[0],
                        CudaNdarray_DEV_DATA(other),
                        1, //strides
-                        1, 
+                        1,
                        CudaNdarray_HOST_STRIDES(other)[0]);
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
@@ -1931,6 +1931,121 @@ CudaNdarray_get_base(CudaNdarray *self, void *closure)
    return base;
 }

+void put_in_dict(PyObject * dict, const char * key, int val)
+{
+  PyObject * k = PyString_FromString(key);
+  PyObject * v = PyInt_FromLong(val);
+  PyDict_SetItem(dict, k, v);
+  Py_DECREF(k);
+  Py_DECREF(v);
+}
+
+PyObject *
+GetDeviceProperties(PyObject* _unused, PyObject* args)
+{
+  int dev_id = -1;
+  if (! PyArg_ParseTuple(args, "i", &dev_id))
+    return NULL;
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, dev_id);
+
+  PyObject * dict = PyDict_New();
+  PyObject * str= PyString_FromString("name");
+  PyObject * i = PyString_FromString(deviceProp.name);
+  PyDict_SetItem(dict, str, i);
+  Py_DECREF(str);
+  Py_DECREF(i);
+
+  put_in_dict(dict, "major", deviceProp.major);
+  put_in_dict(dict, "minor", deviceProp.minor);
+#if CUDART_VERSION >= 2020
+  int driverVersion = 0, runtimeVersion = 0;
+  cudaDriverGetVersion(&driverVersion);
+  cudaRuntimeGetVersion(&runtimeVersion);
+  put_in_dict(dict, "driverVersion", driverVersion);
+  put_in_dict(dict, "runtimeVersion", runtimeVersion);
+#endif
+#if CUDART_VERSION >= 2000
+
+  put_in_dict(dict, "multiProcessorCount", deviceProp.multiProcessorCount);
+  //if ConvertSMVer2Cores is not defined in cuda_runtime_api.h, the run time is too old.
+  int sm_cores = -1;
+  if(deviceProp.major==1)
+    sm_cores = 32;
+  else if(deviceProp.major==2 && deviceProp.minor==0)
+    sm_cores = 32;
+  else if(deviceProp.major==2 && deviceProp.minor==1)
+    sm_cores = 48;
+  put_in_dict(dict, "coresCount", sm_cores * deviceProp.multiProcessorCount);
+#endif
+  put_in_dict(dict, "totalConstMem", deviceProp.totalConstMem);
+  put_in_dict(dict, "sharedMemPerBlock", deviceProp.sharedMemPerBlock);
+  put_in_dict(dict, "regsPerBlock", deviceProp.regsPerBlock);
+  put_in_dict(dict, "warpSize", deviceProp.warpSize);
+  put_in_dict(dict, "maxThreadsPerBlock", deviceProp.maxThreadsPerBlock);
+  put_in_dict(dict, "maxThreadsDim0", deviceProp.maxThreadsDim[0]);
+  put_in_dict(dict, "maxThreadsDim1", deviceProp.maxThreadsDim[1]);
+  put_in_dict(dict, "maxThreadsDim2", deviceProp.maxThreadsDim[2]);
+  put_in_dict(dict, "maxGridSize0", deviceProp.maxGridSize[0]);
+  put_in_dict(dict, "maxGridSize1", deviceProp.maxGridSize[1]);
+  put_in_dict(dict, "maxGridSize2", deviceProp.maxGridSize[2]);
+  put_in_dict(dict, "memPitch", deviceProp.memPitch);
+  put_in_dict(dict, "textureAlignment", deviceProp.textureAlignment);
+  put_in_dict(dict, "clockRate", deviceProp.clockRate);
+#if CUDART_VERSION >= 2000
+  put_in_dict(dict, "deviceOverlap", deviceProp.deviceOverlap);
+#endif
+#if CUDART_VERSION >= 2020
+  put_in_dict(dict, "kernelExecTimeoutEnabled", deviceProp.kernelExecTimeoutEnabled);
+  put_in_dict(dict, "integrated", deviceProp.integrated);
+  put_in_dict(dict, "canMapHostMemory", deviceProp.canMapHostMemory);
+  put_in_dict(dict, "computeMode", deviceProp.computeMode);
+  //in the doc of this fct tell that 0 - Normal mode, 1 - only 1 context, 2 - no context
+#endif
+#if CUDART_VERSION >= 3000
+  put_in_dict(dict, "concurrentKernels", deviceProp.concurrentKernels);
+#endif
+#if CUDART_VERSION >= 3010
+  put_in_dict(dict, "ECCEnabled", deviceProp.ECCEnabled);
+#endif
+#if CUDART_VERSION >= 3020
+  put_in_dict(dict, "tccDriver", deviceProp.tccDriver);
+#endif
+
+  return dict;
+}
+
+/*
+ * Returns in *free and *total respectively, the free and total amount of memory available for allocation by the device in bytes.
+ */
+PyObject *
+GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
+{
+    size_t free = 0, total = 0;
+    if(g_gpu_context_active == 0){
+        PyErr_Format(PyExc_RuntimeError, "No gpu device selected yet. Please make sure the gpu device was initialized by Theano before.");
+        return NULL;
+    }
+
+    cudaError_t err = cudaMemGetInfo(&free, &total);
+    if (err != cudaSuccess){
+        PyErr_Format(PyExc_RuntimeError, "Error while getting memory info about the gpu %d");
+        return NULL;
+    }
+    return PyTuple_Pack(2, PyLong_FromLong(free), PyLong_FromLong(total));
+}
+
+#if COMPUTE_GPU_MEM_USED
+/*
+ * Return the size in bytes that Theano currently have allocated on the gpu.
+ */
+PyObject *
+GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
+{
+    return PyLong_FromLong(_allocated_size);
+}
+#endif
+
 static PyGetSetDef CudaNdarray_getset[] = {
    {"shape",
        (getter)CudaNdarray_get_shape,
@@ -2418,6 +2533,11 @@ static PyMethodDef module_methods[] = {
    {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."},
    {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."},
    {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
+    {"device_properties", GetDeviceProperties, METH_VARARGS, "Return a dictionary with the device properties."},
+    {"mem_info", GetDeviceMemInfo, METH_NOARGS, "Return a tuple with the free and total memory on the gpu in bytes."},
+#if COMPUTE_GPU_MEM_USED
+    {"theano_allocated", GetTheanoAllocInfo, METH_NOARGS, "Return the size in bytes of memory Theano currently have allocated on the gpu."},
+#endif
    {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
    {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable.  strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
    {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},