Commit dd7f9af2 authored by Pascal Lamblin

Removed tabs and trailing spaces.

Parent 916d2267
@@ -52,9 +52,9 @@ void * device_malloc(size_t size)
#if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval;
_alloc_size_table[i].size=size;
break;
}
}
_allocated_size += size;
@@ -91,12 +91,12 @@ int device_free(void *ptr)
size_t total_freed = 0;
for(;i<TABLE_SIZE;i++)
if(_alloc_size_table[i].ptr==ptr){
_allocated_size -= _alloc_size_table[i].size;
total_freed += _alloc_size_table[i].size;
_alloc_size_table[i].ptr=0;
_alloc_size_table[i].size=0;
break;
}
if(i==TABLE_SIZE)
printf("Unallocated unknow size!\n");
@@ -161,10 +161,10 @@ CudaNdarray_uninit(CudaNdarray*self)
}
//make the rightmost coords change fastest
//TODO: why does a downward for-loop not work????
//TODO: use the log2_dims and driver code to remove / and %
//TODO: skip the last division (when d == 0)
#define decl_k_elemwise_unary_rowmajor(name, F) \
__global__ void name (unsigned int numEls, \
unsigned int nd, \
@@ -183,7 +183,7 @@ __global__ void name (unsigned int numEls, \
for (unsigned int _d = 0; _d < nd; ++_d) \
{ \
unsigned int d = nd - _d-1; \
/* i_d used to be unsigned, but there is a bug in nvcc 3.0; making it signed fixes the bug. */\
int i_d = ii % dim[d]; /* i_d is our position in the d'th dimension */ \
ii = ii / dim[d]; \
a_i += i_d * a_str[d]; /* increment our a and z pointers by i_d elements */ \
@@ -191,7 +191,7 @@ __global__ void name (unsigned int numEls, \
} \
z_i[0] = F(a_i[0]); \
} \
}
template<typename T> __device__ T unary_copy(T a) { return a; }
decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy, unary_copy<float>)
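/* Editorial aside: k_elemwise_unary_rowmajor above peels coordinates off the
   flat index ii with % and /, rightmost dimension first, accumulating strided
   offsets into a_i and z_i. A host-side sketch of that index decomposition
   (hypothetical helper, not part of this file): */
static void unravel_rowmajor(unsigned int ii, const int * dim, unsigned int nd, int * coord)
{
    for (unsigned int _d = 0; _d < nd; ++_d)
    {
        unsigned int d = nd - _d - 1;   // rightmost coordinate changes fastest
        coord[d] = ii % dim[d];         // position along dimension d
        ii = ii / dim[d];               // remainder indexes the outer dimensions
    }
}
/* e.g. dim = {2, 3, 4}, ii = 17: 17 % 4 = 1, 17 / 4 = 4; 4 % 3 = 1, 4 / 3 = 1;
   1 % 2 = 1, so coord = {1, 1, 1} (and indeed 1*12 + 1*4 + 1 = 17). */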
@@ -240,7 +240,7 @@ CudaNdarray_init(CudaNdarray *self, PyObject *args, PyObject *kwds)
PyObject *arr=NULL;
if (! PyArg_ParseTuple(args, "O", &arr))
return -1;
if (! PyArray_Check(arr))
{
PyErr_SetString(PyExc_TypeError, "PyArray arg required");
@@ -249,7 +249,7 @@ CudaNdarray_init(CudaNdarray *self, PyObject *args, PyObject *kwds)
int rval = CudaNdarray_CopyFromArray(self, (PyArrayObject*)arr);
return rval;
}
static PyMemberDef CudaNdarray_members[] =
{
/*
{"first", T_OBJECT_EX, offsetof(CudaNdarray, first), 0,
@@ -272,7 +272,7 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
free(npydims);
if (!rval){
return NULL;
}
assert (PyArray_ITEMSIZE(rval) == sizeof(real));
return rval;
@@ -313,7 +313,7 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
assert (PyArray_ITEMSIZE(rval) == sizeof(real));
cublasGetVector(PyArray_SIZE(rval), sizeof(real),
contiguous_self->devdata, 1,
PyArray_DATA(rval), 1);
CNDA_THREAD_SYNC;
@@ -547,7 +547,7 @@ PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
return (PyObject*)self_sum;
}
__global__ void k_copy_reshape_rowmajor(unsigned int numEls,
unsigned int a_nd, const float * a_data, const int * a_dim, const int * a_str,
unsigned int z_nd, float * z_data, const int * z_dim, const int * z_str)
{
@@ -560,7 +560,7 @@ __global__ void k_copy_reshape_rowmajor(unsigned int numEls,
unsigned int a_ii = i;
for (unsigned int _d = 0; _d < a_nd; ++_d) //make the rightmost coords change fastest
{
unsigned int d = a_nd - _d-1;
unsigned int a_i_d = a_ii % a_dim[d];
a_ii = a_ii / a_dim[d];
a_i += a_i_d * a_str[d];
@@ -569,7 +569,7 @@ __global__ void k_copy_reshape_rowmajor(unsigned int numEls,
float * z_i = z_data;
for (unsigned int _d = 0; _d < z_nd; ++_d) //make the rightmost coords change fastest
{
unsigned int d = z_nd - _d-1;
//I tried to make the for loop count down, but it didn't work!?
unsigned int z_i_d = z_ii % z_dim[d];
z_i += z_i_d * z_str[d];
@@ -611,7 +611,7 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
return NULL;
}
rval_size = rval_size * rval_dims[i];
}
}else{
rval_size = PyInt_AsLong(shape);
rval_dims[0] = rval_size;
@@ -632,24 +632,24 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
{
//return a view, not a copy
CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd);
if (!rval || 0 != rval->data_allocated
||CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
{
Py_XDECREF(rval);
free(rval_dims);
return NULL;
}
//set dim and stride
int size = 1;
for (int i = rval_nd-1; i >= 0; --i)
{
CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size);
CudaNdarray_set_dim(rval, i, rval_dims[i]);
size = size * rval_dims[i];
}
free(rval_dims);
return (PyObject*)rval;
}
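/* Editorial aside: the view branch above derives C-contiguous strides from
   the dims right-to-left, giving size-1 dimensions a stride of 0 (the
   broadcasting convention used throughout this file). Sketch with
   hypothetical names: */
static void c_contiguous_strides(const int * dims, int nd, int * strides)
{
    int size = 1;
    for (int i = nd - 1; i >= 0; --i)
    {
        strides[i] = (dims[i] == 1) ? 0 : size;
        size = size * dims[i];
    }
}
/* dims {4, 1, 5} -> strides {5, 0, 1}; element (i, j, k) lives at 5*i + k. */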
// allocate new space (TODO: test to see if we can re-use old one)
@@ -665,21 +665,21 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
unsigned int threads_per_block = std::min(rval_size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(rval_size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
k_copy_reshape_rowmajor<<<n_blocks,threads_per_block>>>(
rval_size,
self->nd,
CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_DIMS(self), CudaNdarray_DEV_STRIDES(self),
rval->nd,
CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_DIMS(rval), CudaNdarray_DEV_STRIDES(rval));
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
Py_DECREF(rval);
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_copy_reshape_rowmajor", cudaGetErrorString(err));
free(rval_dims);
return NULL;
}
free(rval_dims);
return (PyObject*)rval;
}
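/* Editorial aside: the launch configuration above first clamps
   threads_per_block to the element count, then asks ceil_intdiv for enough
   blocks to cover the remainder; ceil_intdiv is presumably the usual
   round-up integer division. A sketch under that assumption: */
static unsigned int ceil_div(unsigned int a, unsigned int b)
{
    return (a + b - 1) / b;   // smallest q such that q * b >= a
}
/* e.g. 1000 elements at 256 threads/block -> ceil_div(1000, 256) = 4 blocks;
   the launched threads cover indices 0..1023 and the kernels' numEls bound
   makes the last 24 do nothing. */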
@@ -705,7 +705,7 @@ PyObject * CudaNdarray_SetStride(CudaNdarray * self, PyObject *args)
{
int pos, stride;
if (! PyArg_ParseTuple(args, "ii", &pos, &stride))
return NULL;
if ((pos < 0) || (pos >= self->nd))
{
PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
@@ -723,7 +723,7 @@ PyObject * CudaNdarray_SetShapeI(CudaNdarray * self, PyObject *args)
{
int pos, dim;
if (! PyArg_ParseTuple(args, "ii", &pos, &dim))
return NULL;
if ((pos < 0) || (pos >= self->nd))
{
PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
@@ -754,37 +754,37 @@ CudaNdarray_exp(CudaNdarray* self)
}
unsigned int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
k_elemwise_unary_rowmajor_exp<<<n_blocks,threads_per_block>>>(size, self->nd, CudaNdarray_DEV_DIMS(self),
CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_STRIDES(self),
CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_STRIDES(rval));
//TODO: don't do this right away, do it when we need the result
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
Py_DECREF(rval);
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
return NULL;
}
return (PyObject*)rval;
}
static PyMethodDef CudaNdarray_methods[] =
{
{"__array__",
{"__array__",
(PyCFunction)CudaNdarray_CreateArrayObj, METH_NOARGS,
"Copy from the device to a numpy ndarray"},
{"__copy__",
(PyCFunction)CudaNdarray_View, METH_NOARGS,
"Create a shallow copy of this object. used by module copy"},
{"__deepcopy__",
{"__deepcopy__",
(PyCFunction)CudaNdarray_DeepCopy, METH_O,
"Create a copy of this object"},
{"zeros",
(PyCFunction)CudaNdarray_Zeros, METH_STATIC,
"Create a new CudaNdarray with specified shape, filled with zeros."},
{"copy",
{"copy",
(PyCFunction)CudaNdarray_Copy, METH_NOARGS,
"Create a copy of this object"},
{"reduce_sum",
@@ -794,7 +794,7 @@ static PyMethodDef CudaNdarray_methods[] =
(PyCFunction)CudaNdarray_exp, METH_NOARGS,
"Return the exponential of all elements"},
{"reshape",
(PyCFunction)CudaNdarray_Reshape, METH_O,
"Return a reshaped view (or copy) of this ndarray\n\
The required argument is a tuple of integers specifying the shape of the new ndarray."},
{"view",
@@ -839,7 +839,7 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other)
CudaNdarray * other = (CudaNdarray *)py_other;
if(!CudaNdarray_is_c_contiguous(self) || !CudaNdarray_is_c_contiguous(other)){
PyErr_SetString(PyExc_TypeError, "We have implementet only the c_contiguous version for now.");
return NULL;
}
//standard elemwise size checks
@@ -876,7 +876,7 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other)
self->devdata, other->devdata, rval->devdata, size);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kAdd", cudaGetErrorString(err));
Py_DECREF(rval);
@@ -893,7 +893,7 @@ __global__ void name(const int d0, const int d1, const int d2,\
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){\
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){\
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){\
F(a[i0*sA0 + i1*sA1 + i2*sA2], b[i0*sB0 + i1*sB1 + i2*sB2]); \
}\
}\
}\
@@ -901,16 +901,16 @@ __global__ void name(const int d0, const int d1, const int d2,\
#define decl_k_elemwise_binary_inplace_rowmajor_4(name, F) \
__global__ void name(const int d0, const int d1, const int d2, const int d3,\
float* a, const int sA0, const int sA1,\
const int sA2, const int sA3,\
const float* b, const int sB0, const int sB1,\
const int sB2, const int sB3){\
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){\
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){\
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){\
for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){\
F(a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3], b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3]); \
}\
}\
}\
}\
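/* Editorial aside: the nested loops in these macros are grid-stride loops:
   each block starts at its blockIdx along the outer dimensions and strides by
   gridDim, each thread likewise strides by blockDim along the inner ones, so
   any (d0, d1, d2, d3) is covered by any launch shape. A minimal 1-D sketch
   of the same pattern (hypothetical kernel, not in this file): */
__global__ void k_scale_1d(float * a, const float s, const int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        a[i] *= s;   // safe for any n, even n larger than the total thread count
}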
@@ -930,23 +930,23 @@ __global__ void k_iAdd_3(const int d0, const int d1, const int d2,\
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){\
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){\
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){\
a[i0*sA0 + i1*sA1 + i2*sA2]+= b[i0*sB0 + i1*sB1 + i2*sB2]; \
}\
}\
}\
}
__global__ void k_iAdd_4(const int d0, const int d1, const int d2, const int d3,\
float* a, const int sA0, const int sA1,\
const int sA2, const int sA3,\
const float* b, const int sB0, const int sB1,\
const int sB2, const int sB3){\
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){\
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){\
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){\
for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){\
a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3] += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3]; \
}\
}\
}\
}\
@@ -958,23 +958,23 @@ __global__ void k_iDiv_3(const int d0, const int d1, const int d2,\
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){\
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){\
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){\
a[i0*sA0 + i1*sA1 + i2*sA2]/= b[i0*sB0 + i1*sB1 + i2*sB2]; \
}\
}\
}\
}
__global__ void k_iDiv_4(const int d0, const int d1, const int d2, const int d3,\
float* a, const int sA0, const int sA1,\
const int sA2, const int sA3,\
const float* b, const int sB0, const int sB1,\
const int sB2, const int sB3){\
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){\
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){\
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){\
for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){\
a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3] /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3]; \
}\
}\
}\
}\
@@ -1020,19 +1020,19 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
}
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
}
if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
Py_INCREF(py_self);
return py_self;
}
void (*k_iop_3)(const int, const int, const int,
float*, const int, const int, const int,
const float*, const int, const int, const int);
void (*k_iop_4)(const int, const int, const int, const int,
float*, const int, const int,
const int, const int,
const float*, const int, const int,
const int, const int);
if(fct_nb == 0){
k_iop_3 = k_iAdd_3;
k_iop_4 = k_iAdd_4;
@@ -1040,7 +1040,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
k_iop_3 = k_iDiv_3;
k_iop_4 = k_iDiv_4;
}
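/* Editorial aside: k_iop_3 and k_iop_4 are host-side pointers to __global__
   functions, so the add and divide cases share one launch path in the switch
   below. A minimal sketch of that dispatch style (hypothetical kernels, not
   part of this file): */
__global__ void k_incr(float * a, const int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) a[i] += 1.0f;
}
__global__ void k_halve(float * a, const int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) a[i] *= 0.5f;
}
static void apply_op(float * dev_a, int n, int fct_nb)
{
    void (*k)(float *, const int) = (fct_nb == 0) ? k_incr : k_halve;
    k<<<(n + 255) / 256, 256>>>(dev_a, n);   // launch through the pointer
}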
switch(self->nd)
{
case 1:
@@ -1062,7 +1062,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
CudaNdarray_HOST_STRIDES(other)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_3", cudaGetErrorString(err));
return NULL;
@@ -1078,7 +1078,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(self)[1], NUM_VECTOR_OP_THREADS_PER_BLOCK)
);
k_iop_3<<<n_blocks, n_threads>>>(1,
CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_DEV_DATA(self),
@@ -1091,7 +1091,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
CudaNdarray_HOST_STRIDES(other)[1]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_3", cudaGetErrorString(err));
return NULL;
@@ -1109,7 +1109,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(self)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)
);
k_iop_3<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_HOST_DIMS(self)[2],
@@ -1123,7 +1123,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
CudaNdarray_HOST_STRIDES(other)[2]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_3", cudaGetErrorString(err));
return NULL;
@@ -1141,7 +1141,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(self)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)
);
k_iop_4<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[0],
CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_HOST_DIMS(self)[2],
@@ -1158,7 +1158,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
CudaNdarray_HOST_STRIDES(other)[3]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_4", cudaGetErrorString(err));
return NULL;
@@ -1195,7 +1195,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
CudaNdarray_HOST_STRIDES(other)[4]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iop_4", cudaGetErrorString(err));
return NULL;
@@ -1289,7 +1289,7 @@ static PyNumberMethods CudaNdarrayNumberMethods =
/////////////////////
// Will be called by __len__ in Python
static Py_ssize_t
CudaNdarray_len(PyObject * py_self)
{
CudaNdarray * self = (CudaNdarray*) py_self;
@@ -1426,7 +1426,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
{
//elements of the tuple can be either integers or slices
//the dimensionality of the view we will return is diminished for each slice in the tuple
if (PyTuple_Size(key) > self->nd)
{
PyErr_SetString(PyExc_IndexError, "index error");
@@ -1437,9 +1437,9 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
int rval_nd = self->nd;
for (int d = 0; d < PyTuple_Size(key); ++d)
{
//On some platforms PyInt_Check(<type 'numpy.int64'>) returns true, on others it returns false.
//So we use PyArray_IsAnyScalar, which should cover everything.
rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, d));
}
//allocate our subtensor view
@@ -1455,7 +1455,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
return NULL;
}
// rval_d will refer to the current dimension in the rval.
// It will not be incremented for integer keys, but will be incremented for slice
// keys
int rval_d = 0;
@@ -1464,7 +1464,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
{
// keys can be shorter than self->nd.
// when that happens, it means that the remaining dimensions are "full slices"
if (d >=PyTuple_Size(key))
{
CudaNdarray_set_stride(rval, rval_d, CudaNdarray_HOST_STRIDES(self)[d]);
CudaNdarray_set_dim(rval, rval_d, CudaNdarray_HOST_DIMS(self)[d]);
@@ -1553,53 +1553,53 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *v)
if(CudaNdarray_Check(o) && PyArray_Check(v)){
// We try to copy directly into this CudaNdarray from the ndarray
CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key);
int typenum = PyArray_TYPE(v);
if(!rval){
// CudaNdarray_Subscript failed and set the error msg.
Py_XDECREF(rval);
return -1;
}
if (typenum != REAL_TYPENUM){
PyErr_SetString(PyExc_TypeError, "CudaNdarray.__setitem__: can only copy from float32 arrays");
Py_XDECREF(rval);
return -1;
}
if(! CudaNdarray_is_c_contiguous(rval)){
PyErr_SetString(PyExc_NotImplementedError, "CudaNdarray.__setitem__: When the new value is an ndarray the part where we copy it to must be c contiguous.");
Py_XDECREF(rval);
return -1;
}
if(rval->nd != ((PyArrayObject*)v)->nd){
PyErr_Format(PyExc_NotImplementedError, "CudaNdarray.__setitem__: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.",
rval->nd,((PyArrayObject*)v)->nd);
Py_XDECREF(rval);
return -1;
}
for(int i=0 ; i<rval->nd ; i++){
if(CudaNdarray_HOST_DIMS(rval)[i] != ((PyArrayObject*)v)->dimensions[i]){
PyErr_Format(PyExc_ValueError, "CudaNdarray.__setitem__: need same dimensions for dim %d, destination=%d, source=%ld",
i,
CudaNdarray_HOST_DIMS(rval)[i],
(long int)(((PyArrayObject*)v)->dimensions[i]));
Py_XDECREF(rval);
return -1;
}
}
PyArrayObject * py_v = (PyArrayObject*)PyArray_ContiguousFromAny((PyObject*)v, typenum,
rval->nd, rval->nd);
cublasSetVector(PyArray_SIZE(py_v),
sizeof(real),
PyArray_DATA(py_v), 1,
rval->devdata, 1);
CNDA_THREAD_SYNC;
Py_XDECREF(py_v);
Py_XDECREF(rval);
if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray.__setitem__: error copying ndarray data to device memory");
return -1;
}
return 0;
}
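/* Editorial aside: the host-to-device copy above relies on the CUBLAS v1
   helper cublasSetVector(n, elemSize, x, incx, y, incy), which moves n
   elements of elemSize bytes from host x to device y; with incx == incy == 1
   it behaves like a dense copy of n * elemSize bytes, and errors are
   retrieved afterwards via cublasGetError(). A minimal sketch (hypothetical
   helper): */
static int upload_floats(const float * host, float * dev, int n)
{
    cublasSetVector(n, sizeof(float), host, 1, dev, 1);
    return (CUBLAS_STATUS_SUCCESS == cublasGetError()) ? 0 : -1;
}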
@@ -1745,29 +1745,29 @@ CudaNdarray_get_dtype(CudaNdarray *self, void *closure)
}
static PyGetSetDef CudaNdarray_getset[] = {
{"shape",
(getter)CudaNdarray_get_shape,
(setter)CudaNdarray_set_shape,
{"shape",
(getter)CudaNdarray_get_shape,
(setter)CudaNdarray_set_shape,
"shape of this ndarray (tuple)",
NULL},
{"_strides",
(getter)CudaNdarray_get_strides,
(setter)CudaNdarray_set_strides,
{"_strides",
(getter)CudaNdarray_get_strides,
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
//gpudata is needed to allow calling pycuda fct with CudaNdarray input.
{"gpudata",
(getter)CudaNdarray_get_dev_data,
{"gpudata",
(getter)CudaNdarray_get_dev_data,
NULL,//setter)CudaNdarray_set_dev_data,
"device data pointer",
NULL},
{"_dev_data",
(getter)CudaNdarray_get_dev_data,
{"_dev_data",
(getter)CudaNdarray_get_dev_data,
(setter)CudaNdarray_set_dev_data,
"device data pointer",
NULL},
{"dtype",
(getter)CudaNdarray_get_dtype,
{"dtype",
(getter)CudaNdarray_get_dtype,
NULL,
"The dtype of the element. Now always float32",
NULL},
@@ -1788,7 +1788,7 @@ static PyGetSetDef CudaNdarray_getset[] = {
static PyTypeObject CudaNdarrayType =
{
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
@@ -1842,12 +1842,12 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
int *gpu_data = (int*)device_malloc(sizeof(int));
if(gpu_data == NULL){
return PyErr_Format(PyExc_MemoryError,
"CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
"CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
}
get_gpu_ptr_size<<<1,1>>>(gpu_data);
if (cudaSuccess != cublasGetError()){
return PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_ptr_int_size: error when calling the gpu code.");
"CudaNdarray_ptr_int_size: error when calling the gpu code.");
}
int gpu_ptr_size = -1;
cublasGetVector(1, sizeof(int), gpu_data, 1, &gpu_ptr_size, 1);
@@ -1872,7 +1872,7 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
int card_number_provided = 1;
PyArg_ParseTuple(args, "|i", &card_nb); // if we're given something wildly invalid, this will throw a TypeError
if(PyTuple_Size(args) == 0) {
card_number_provided = 0;
card_nb = 0;
@@ -1885,11 +1885,11 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
"Unable to get the number of gpus available: %s",
cudaGetErrorString(cudaGetLastError()));
}
// as soon as the first successful call to a cuda* function is made, a
// gpu context has been created
g_gpu_context_active = 1;
if(deviceCount <= 0) {
return PyErr_Format(PyExc_EnvironmentError,
"Can't use the GPU, no devices support CUDA");
@@ -1947,7 +1947,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args)
PyObject * rval = NULL;
if (! PyArg_ParseTuple(args, "OO", &l, &r))
return NULL;
if (!CudaNdarray_Check(l) || !CudaNdarray_Check(r))
{
@@ -1988,7 +1988,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args)
return NULL;
}
static PyObject *
filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict)
{
/*
@@ -2019,7 +2019,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
if (strict || CudaNdarray_Check(py_data))
{
//TODO: support non-strict "casting" from a vt to the broadcastable/type/size that we need.
if (!CudaNdarray_Check(py_data))
{
Py_DECREF(py_data);
Py_DECREF(broadcastable);
@@ -2096,7 +2096,7 @@ static PyMethodDef module_methods[] = {
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
{"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
{"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
{NULL, NULL, NULL, NULL} /* Sentinel */
};
@@ -2105,7 +2105,7 @@ static PyMethodDef module_methods[] = {
#define PyMODINIT_FUNC void
#endif
PyMODINIT_FUNC
initcuda_ndarray(void)
{
import_array();
@@ -2138,10 +2138,10 @@ initcuda_ndarray(void)
int deviceId = 0; // TODO: what number goes here?
cudaSetDevice(deviceId);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
std::cerr << "Error in SetDevice:" << cudaGetErrorString(err) << "\n";
}
}
}
@@ -2152,29 +2152,29 @@ initcuda_ndarray(void)
//
//////////////////////////////////////
int
CudaNdarray_Check(const PyObject * ob)
{
//TODO: doesn't work with inheritance
return CudaNdarray_CheckExact(ob);
}
int
CudaNdarray_CheckExact(const PyObject * ob)
{
return ((ob->ob_type == &CudaNdarrayType) ? 1 : 0);
}
PyObject *
CudaNdarray_New(int nd)
{
CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
if (self == NULL)
{
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_new_null failed to allocate self");
return NULL;
}
CudaNdarray_null_init(self);
if (nd == 0)
{
self->nd = 0;
@@ -2199,8 +2199,8 @@ CudaNdarray_New(int nd)
//
//////////////////////////////
int
cublas_init()
{
cublasInit();
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
@@ -2210,8 +2210,8 @@ cublas_init()
}
return 0;
}
int
cublas_shutdown()
{
cublasShutdown();
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
@@ -2222,7 +2222,7 @@ cublas_shutdown()
return 0;
}
int
CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
{
int err = CudaNdarray_alloc_contiguous(self, obj->nd, obj->dimensions);
@@ -2242,7 +2242,7 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
return -1;
}
cublasSetVector(PyArray_SIZE(py_src),
sizeof(real),
PyArray_DATA(py_src), 1,
self->devdata, 1);
CNDA_THREAD_SYNC;
@@ -2255,7 +2255,7 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
Py_DECREF(py_src);
return 0;
}
bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self)
{
bool c_contiguous = true;
@@ -2344,15 +2344,15 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
unsigned int size_source = 1;
for (int i = 0; i< self->nd; ++i)
{
if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
&& (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
{
PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%d, source=%d",
i, CudaNdarray_HOST_DIMS(self)[i], CudaNdarray_HOST_DIMS(other)[i]);
PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%d, source=%d",
i, CudaNdarray_HOST_DIMS(self)[i], CudaNdarray_HOST_DIMS(other)[i]);
return -1;
}
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
}
if (0 == size)
{
@@ -2396,11 +2396,11 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
CudaNdarray_DEV_DATA(self), CudaNdarray_HOST_STRIDES(self)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_copy_1d", cudaGetErrorString(err), n_blocks, n_threads);
return -1;
}
}; break;
default:
{
@@ -2409,25 +2409,25 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
// call worker routine
unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
unsigned int threads_per_block = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
CudaNdarray * cuda_dims = other;
if(unbroadcast)
cuda_dims = self;
//copy from other into self
k_elemwise_unary_rowmajor_copy<<<n_blocks, threads_per_block>>>(
size,
(unsigned int)other->nd,
(const int *)CudaNdarray_DEV_DIMS(cuda_dims),
(const float*)CudaNdarray_DEV_DATA(other), (const int *)CudaNdarray_DEV_STRIDES(other),
CudaNdarray_DEV_DATA(self), (const int *)CudaNdarray_DEV_STRIDES(self));
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
//fprint_CudaNdarray(stderr, self);
//fprint_CudaNdarray(stderr, other);
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_elemwise_unary_rowmajor_copy", cudaGetErrorString(err), n_blocks, threads_per_block);
return -1;
}
}
};
return 0;
@@ -2442,7 +2442,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
|| (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
|| (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1]))
{
PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1],
......@@ -2450,7 +2450,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
CudaNdarray_HOST_DIMS(B)[1],
CudaNdarray_HOST_DIMS(C)[0],
CudaNdarray_HOST_DIMS(C)[1]);
return -1;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
@@ -2475,21 +2475,21 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
int unit = 0;
if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_STRIDES(A)[1] == 0) {
unit |= (0x0 << 8);
} else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_STRIDES(A)[0] == 0) {
unit |= (0x1 << 8);
} else {
unit |= (0x2 << 8);
}
if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_STRIDES(B)[1] == 0) {
unit |= (0x0 << 4);
} else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_STRIDES(B)[0] == 0) {
unit |= (0x1 << 4);
} else {
unit |= (0x2 << 4);
}
if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_STRIDES(C)[1] == 0) {
unit |= (0x0 << 0);
} else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_STRIDES(C)[0] == 0) {
unit |= (0x1 << 0);
} else {
unit |= (0x2 << 0);
@@ -2527,7 +2527,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
} else { \
PyErr_SetString(PyExc_NotImplementedError, "negative stride to sGemm");\
return -1; \
}
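/* Editorial aside: `unit` packs one hex digit per operand, A in bits 8..11,
   B in bits 4..7, C in bits 0..3, where 0x0 = rows contiguous (normal),
   0x1 = columns contiguous (transposed), and 0x2 = neither; e.g.
   unit == 0x010 means only B needs the transposed sGemm treatment in the
   switch below. Sketch of the per-operand encoding (hypothetical helper): */
static int stride_code(const int * strides)   // strides[0], strides[1] of a 2-D operand
{
    if (strides[1] == 1 || strides[1] == 0) return 0x0;   // row-major contiguous
    if (strides[0] == 1 || strides[0] == 0) return 0x1;   // column-major contiguous
    return 0x2;                                           // neither; needs a copy
}
/* unit = (stride_code(A_str) << 8) | (stride_code(B_str) << 4) | stride_code(C_str); */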
switch(unit)
{
@@ -2619,7 +2619,7 @@ static __global__ void kernel_reduce_sum(const unsigned int size_z,
unsigned int pos_d;
if (log2_dims_a[d]==-1) //TODO: when things are working, use this switch
{
// this branch is not preferred,
// because the manual said that integer mod and div operations are slow on gpu
pos_d = (ii % dims_a[d]);
ii = (ii / dims_a[d]);
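/* Editorial aside: per the comment above, integer % and / are slow on the
   GPU; the log2_dims_a branch exists so that power-of-two dimensions can use
   bit operations instead. With dims_a[d] == (1 << log2_dims_a[d]):
       pos_d = ii & (dims_a[d] - 1);    // same value as ii % dims_a[d]
       ii    = ii >> log2_dims_a[d];    // same value as ii / dims_a[d]   */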
@@ -2635,14 +2635,14 @@ static __global__ void kernel_reduce_sum(const unsigned int size_z,
}
// now we've got pointers a_data_i and z_data_i into element 0 of the slice over which we are reducing
// do a similar loop
float sum = 0.0f;
switch(n_reduce_dims)
{
case 0:
{
sum = a_data_i[0];
}
break;
case 1:
{
@@ -2675,7 +2675,7 @@ static __global__ void kernel_reduce_sum(const unsigned int size_z,
a_data_ri += stride0;
}
}
};
break;
default:
{
@@ -2693,7 +2693,7 @@ static __global__ void kernel_reduce_sum(const unsigned int size_z,
{
if (log2_dims_a[rd]==-1)
{
// this branch is not preferred,
// because the manual said that integer mod and div operations are slow on gpu
pos_d = (reduce_ii % dims_a[rd]);
reduce_ii = (reduce_ii / dims_a[rd]);
@@ -2767,7 +2767,7 @@ static __global__ void kernel_reduce_sum_1011(
* Dimensions in which the self has size 1 and A has size > 1 are considered summing dimensions
* Dimensions in which self has size > 1 and A has size > 1 are considered non-summing dimensions, and in this case their sizes must be equal.
*/
int
CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A)
{
int verbose = 0;
@@ -2830,7 +2830,7 @@ CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A)
int n_threads_per_block = std::min(n_summations,
NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(ceil_intdiv(n_summations,n_threads_per_block),
NUM_VECTOR_OP_BLOCKS);
int n_structure_cache = self->nd * 4 * sizeof(int);
@@ -2851,26 +2851,26 @@ CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A)
CudaNdarray_DEV_DATA(self));
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kernel_reduce_sum", cudaGetErrorString(err));
return -1;
}
return 0;
}
int
CudaNdarray_reduce_prod(CudaNdarray * self, const CudaNdarray * A)
{
PyErr_SetString(PyExc_NotImplementedError, "");
return -1;
}
int
CudaNdarray_reduce_min(CudaNdarray * self, const CudaNdarray * A)
{
PyErr_SetString(PyExc_NotImplementedError, "");
return -1;
}
int
CudaNdarray_reduce_max(CudaNdarray * self, const CudaNdarray * A)
{
PyErr_SetString(PyExc_NotImplementedError, "");
@@ -2886,7 +2886,7 @@ CudaNdarray_reduce_max(CudaNdarray * self, const CudaNdarray * A)
* For example, if CudaNdarray_HOST_DIMS(self) == [4, 5, 1, 6], and pattern = [0,3,-1,-1, 1], then CudaNdarray_HOST_DIMS(self) would be modified to become:
* [4, 6, 1, 1, 5] (we dropped the original dim[2]==1, and inserted two singleton dimensions with the -1s).
*/
int
CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern)
{
//TODO: pass a workspace pointer to avoid the internal malloc