remove code not needed anymore

961c15e0 · Frederic · 750d7815 · 961c15e0 · 961c15e0
--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -710,131 +710,3 @@ class GpuSoftmaxWithBias(GpuOp):
        return ret1 + "\n" + ret2
 gpu_softmax_with_bias = GpuSoftmaxWithBias()
-class GpuSqrSumAx0(GpuOp):
-    """
-    sqr all element of the input then, sum on axis 0.
-    work only with matrix input.
-    """
-    def __eq__(self, other):
-        return type(self) == type(other)
-    def __hash__(self):
-        return hash(type(self))
-    def __str__(self):
-        return self.__class__.__name__
-    def make_node(self, x):
-        x = as_cuda_ndarray_variable(x)
-        assert x.ndim == 2
-        out = x.type.__class__(dtype='float32', broadcastable=(False,))()
-        return Apply(self, [x], [out])
-    def c_code_cache_version(self):
-        return (1,)
-    def c_code(self, node, nodename, inp, out, sub):
-        x, = inp
-        z, = out
-        fail = sub['fail']
-        return """
-        if (%(x)s->nd != 2)
-        {
-            PyErr_SetString(PyExc_ValueError, "rank error");
-            %(fail)s;
-        }
-        if ((NULL == %(z)s) ||
-            (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
-             CudaNdarray_HOST_DIMS(%(x)s)[1]))
-        {
-            Py_XDECREF(%(z)s);
-            %(z)s = (CudaNdarray*)CudaNdarray_New();
-            if ((NULL == %(z)s)
-                || CudaNdarray_alloc_contiguous(%(z)s, 1,
-                                                CudaNdarray_HOST_DIMS(%(x)s) + 1))
-            {
-                Py_XDECREF(%(z)s);
-                %(z)s = NULL;
-                %(fail)s;
-            }
-        }
-        {
-            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
-                                    32 * 1024);
-//TODO, detect the maximum number of thread per block.
-            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], 512);
-            int n_shared_bytes = n_threads * sizeof(float);
-            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0 &&
-                CudaNdarray_HOST_DIMS(%(x)s)[1] > 0)
-            {
-             KSqrSumAx0
-                    <<<
-                        n_blocks,
-                        n_threads,
-                        n_threads * sizeof(float)
-                    >>>(
-                            CudaNdarray_HOST_DIMS(%(x)s)[0],
-                            CudaNdarray_HOST_DIMS(%(x)s)[1],
-                            CudaNdarray_DEV_DATA(%(x)s),
-                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
-                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
-                            CudaNdarray_DEV_DATA(%(z)s),
-                            CudaNdarray_HOST_STRIDES(%(z)s)[0]
-                    );
-              CNDA_THREAD_SYNC;
-              cudaError_t err = cudaGetLastError();
-              if( cudaSuccess != err)
-              {
-                  PyErr_Format(PyExc_RuntimeError,
-                               "Cuda error: %%s: %%s.\\n Used %%d blocks,"
-                               " %%d threads %%d bytes of shared memory",
-                               "kSoftmax[_fixed_shared]%(nodename)s",
-                               cudaGetErrorString(err),
-                               n_blocks, n_threads, n_shared_bytes);
-                  %(fail)s;
-              }
-            }
-            else if (CudaNdarray_HOST_DIMS(%(z)s)[0] > 0){
-                cudaMemset(%(z)s->devdata, 0,
-                           CudaNdarray_SIZE(%(z)s) * sizeof(float));
-            }
-        }
-        assert(%(z)s);
-        """ % locals()
-    def c_support_code(self):
-        return """
-//Not well optimized, we don't read in contiguous blocks
-__global__ void KSqrSumAx0(int nb_row, int nb_col,
-        const float* x, int x_str0, int x_str1, float* z, int z_str0) {
-    const int blockCount = gridDim.x;
-    const int blockNum = blockIdx.x;
-    const int threadCount = blockDim.x;
-    const int threadNum = threadIdx.x;
-    extern __shared__ float buf[];
-    float myresult = 0.0f;
-    for (int i = blockIdx.x; i < nb_col; i += gridDim.x) {
-        myresult = 0;
-        for (int j = threadIdx.x; j < nb_row; j += blockDim.x) {
-            float val = x[i + j*nb_col];
-            myresult += val * val;
-        }
-        __syncthreads();
-        buf[threadIdx.x] = myresult;
-        __syncthreads();
-        if(threadIdx.x==0){
-            for(int j=1;j<blockDim.x;j++)
-                myresult += buf[j];
-            z[i] = myresult;
-        }
-        __syncthreads();
-    }
-}"""
-gpu_sqr_sum_ax0 = GpuSqrSumAx0()
--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -247,41 +247,3 @@ def test_softmax():
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
-def test_sqr_sum_ax0():
-    x = T.fmatrix('x')
-    z = (x**2).sum(axis=0)
-    f = theano.function([x], z, mode=mode_without_gpu)
-    f_gpu = theano.function([x], z, mode=mode_with_gpu)
-    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
-                      cuda.nnet.GpuSqrSumAx0)
-    def cmp(n, m):
-        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
-        out = f(data)
-        gout = f_gpu(data)
-        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-    #cmp(10, 15)
-    #cmp(120000, 15)
-    #cmp(15, 120000)
-    #cmp(4000, 4000)
-    #cmp(0, 15)
-    #cmp(10, 0)
-    #cmp(0, 0)
-    m = mode_with_gpu.excluding("local_gpu_sqr_sum_ax0")
-    f_gpu2 = theano.function([x], z, mode=m)
-    n, m = 4000, 4000
-    data = numpy.arange(n * m, dtype='float32').reshape(n, m)
-    import time
-    t0 = time.time()
-    for i in range(1000):
-        f_gpu(data)
-    t1 = time.time()
-    for i in range(1000):
-        f_gpu2(data)
-    t2 = time.time()
-    print t1 - t0, t2 - t1