Remove non-ported code: drop GpuSoftmax and GpuSoftmaxWithBias from sandbox/gpuarray/nnet.py, along with their tests

b922c47c · Vincent Dumoulin · a3102cd3 · b922c47c · b922c47c
--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
@@ -407,350 +407,3 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
 gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
-class GpuSoftmax (GpuOp):
-    """
-    Implement Softmax on the gpu.
-    """
-    def __eq__(self, other):
-        return type(self) == type(other)
-    def __hash__(self):
-        return hash(type(self))
-    def __str__(self):
-        return self.__class__.__name__
-    def make_node(self, x):
-        return Apply(self, [x], [x.type()])
-    def infer_shape(self, node, shape):
-        return shape
-    def c_code_cache_version(self):
-        return (9,) + inline_softmax.code_version
-    def c_code(self, node, nodename, inp, out, sub):
-        x, = inp
-        z, = out
-        fail = sub['fail']
-        return """
-        if (%(x)s->nd != 2)
-        {
-            PyErr_SetString(PyExc_ValueError, "rank error");
-            %(fail)s;
-        }
-        if ((NULL == %(z)s) ||
-            (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
-             CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
-            (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
-             CudaNdarray_HOST_DIMS(%(x)s)[1]))
-        {
-            Py_XDECREF(%(z)s);
-            %(z)s = (CudaNdarray*)CudaNdarray_New();
-            if ((NULL == %(z)s)
-                || CudaNdarray_alloc_contiguous(%(z)s, 2,
-                                                CudaNdarray_HOST_DIMS(%(x)s)))
-            {
-                Py_XDECREF(%(z)s);
-                %(z)s = NULL;
-                %(fail)s;
-            }
-        }
-        {
-            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
-                                    32 * 1024);
-//TODO, detect the maximum number of thread per block.
-            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
-            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] *
-                                     2 * sizeof(float);
-            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
-            {
-              //Those numbers are based on not too recent GPU
-              //to make them compatible with more GPU.
-              //TODO: read the information from the card.
-              if(n_shared_bytes < (32 * 1024 - 500)){
-                kSoftmax_%(nodename)s
-                    <<<
-                        n_blocks,
-                        n_threads,
-                        n_shared_bytes
-                    >>>(
-                            CudaNdarray_HOST_DIMS(%(x)s)[0],
-                            CudaNdarray_HOST_DIMS(%(x)s)[1],
-                            CudaNdarray_DEV_DATA(%(x)s),
-                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
-                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
-                            CudaNdarray_DEV_DATA(%(z)s),
-                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
-                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
-                    );
-              }else{
-                kSoftmax_fixed_shared%(nodename)s
-                    <<<
-                        n_blocks,
-                        n_threads,
-                        n_threads * sizeof(float)
-                    >>>(
-                            CudaNdarray_HOST_DIMS(%(x)s)[0],
-                            CudaNdarray_HOST_DIMS(%(x)s)[1],
-                            CudaNdarray_DEV_DATA(%(x)s),
-                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
-                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
-                            CudaNdarray_DEV_DATA(%(z)s),
-                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
-                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
-                    );
-              }
-              CNDA_THREAD_SYNC;
-              cudaError_t err = cudaGetLastError();
-              if( cudaSuccess != err)
-              {
-                  PyErr_Format(PyExc_RuntimeError,
-                               "Cuda error: %%s: %%s.\\n Used %%d blocks,"
-                               " %%d threads %%d bytes of shared memory",
-                               "kSoftmax[_fixed_shared]%(nodename)s",
-                               cudaGetErrorString(err),
-                               n_blocks, n_threads, n_shared_bytes);
-                  %(fail)s;
-              }
-            }
-        }
-        assert(%(z)s);
-        """ % locals()
-    def c_support_code_apply(self, node, nodename):
-        ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
-                params=['int M', 'int N',
-                    'const float * x', 'const int sx0', 'const int sx1',
-                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-                body=[
-                    "extern __shared__ float buf[]",
-                    "float * buf2 = buf + N",
-                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                    "     blockIDX += gridDim.x){",
-                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                        "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
-                        "buf2[tx] = buf[tx]",
-                      "}",
-                      "__syncthreads()",
-                      inline_softmax('N', 'buf', 'buf2',
-                                     'threadIdx.x', 'blockDim.x'),
-                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                        # This set all value correctly
-                        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
-                      "}",
-                      "__syncthreads()",
-                    "}",
-                ])
-        ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
-                params=['int M', 'int N',
-                    'const float * x', 'const int sx0', 'const int sx1',
-                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-                body=[
-                    "extern __shared__ float buf[]",
-                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                    "     blockIDX += gridDim.x){",
-                      "const float *x_ptr = &x[blockIDX * sx0]",
-                      "float *sm_ptr = &sm[blockIDX * sm_s0]",
-                      inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
-                                                  'sm_ptr', 'sm_s1',
-                                                  'threadIdx.x', 'blockDim.x'),
-                      "__syncthreads()",
-                    "}",
-                    ])
-        return ret1 + "\n" + ret2
-gpu_softmax = GpuSoftmax()
-class GpuSoftmaxWithBias (GpuOp):
-    """
-    Implement SoftmaxWithBias on the gpu.
-    """
-    nin = 2
-    nout = 1
-    def __eq__(self, other):
-        return type(self) == type(other)
-    def __hash__(self):
-        return hash(type(self))
-    def __str__(self):
-        return self.__class__.__name__
-    def make_node(self, x, b):
-        return Apply(self, [x, b], [x.type()])
-    def infer_shape(self, node, shape):
-        return  [shape[0]]
-    def c_code_cache_version(self):
-        #return ()
-        return (8,) + inline_softmax.code_version
-    def c_code(self, node, nodename, inp, out, sub):
-        x, b = inp
-        z, = out
-        fail = sub['fail']
-        return """
-        if (%(x)s->nd != 2)
-        {
-            PyErr_SetString(PyExc_ValueError, "rank error input");
-            %(fail)s;
-        }
-        if (%(b)s->nd != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "rank error for the bias");
-            %(fail)s;
-        }
-        if ((CudaNdarray_HOST_DIMS(%(x)s)[1] !=
-            CudaNdarray_HOST_DIMS(%(b)s)[0]))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "number of columns in x (%%ld)"
-                         " does not match length of b (%%ld)",
-                         (long int)CudaNdarray_HOST_DIMS(%(x)s)[1],
-                         (long int)CudaNdarray_HOST_DIMS(%(b)s)[0]);
-            %(fail)s;
-        }
-        if ((NULL == %(z)s)
-            || (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
-                CudaNdarray_HOST_DIMS(%(x)s)[0])
-            || (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
-                CudaNdarray_HOST_DIMS(%(x)s)[1]))
-        {
-            Py_XDECREF(%(z)s);
-            %(z)s = (CudaNdarray*)CudaNdarray_New();
-            if ((NULL == %(z)s)
-                || CudaNdarray_alloc_contiguous(%(z)s, 2,
-                       CudaNdarray_HOST_DIMS(%(x)s)))
-            {
-                Py_XDECREF(%(z)s);
-                %(z)s = NULL;
-                %(fail)s;
-            }
-        }
-        {
-            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024);
-//TODO, detect the maximum number of thread per block.
-            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
-            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] *
-                                     2 * sizeof(float);
-            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
-            {
-              if(n_shared_bytes < (32 * 1024 - 500)){
-                kSoftmaxWithBias_%(nodename)s
-                    <<<
-                        n_blocks,
-                        n_threads,
-                        n_shared_bytes
-                    >>>(
-                        CudaNdarray_HOST_DIMS(%(x)s)[0],
-                        CudaNdarray_HOST_DIMS(%(x)s)[1],
-                        CudaNdarray_DEV_DATA(%(x)s),
-                        CudaNdarray_HOST_STRIDES(%(x)s)[0],
-                        CudaNdarray_HOST_STRIDES(%(x)s)[1],
-                        CudaNdarray_DEV_DATA(%(b)s),
-                        CudaNdarray_HOST_STRIDES(%(b)s)[0],
-                        CudaNdarray_DEV_DATA(%(z)s),
-                        CudaNdarray_HOST_STRIDES(%(z)s)[0],
-                        CudaNdarray_HOST_STRIDES(%(z)s)[1]
-                    );
-              }else{
-                kSoftmaxWithBias_fixed_shared%(nodename)s
-                    <<<
-                        n_blocks,
-                        n_threads,
-                        n_threads * sizeof(float)
-                    >>>(
-                        CudaNdarray_HOST_DIMS(%(x)s)[0],
-                        CudaNdarray_HOST_DIMS(%(x)s)[1],
-                        CudaNdarray_DEV_DATA(%(x)s),
-                        CudaNdarray_HOST_STRIDES(%(x)s)[0],
-                        CudaNdarray_HOST_STRIDES(%(x)s)[1],
-                        CudaNdarray_DEV_DATA(%(b)s),
-                        CudaNdarray_HOST_STRIDES(%(b)s)[0],
-                        CudaNdarray_DEV_DATA(%(z)s),
-                        CudaNdarray_HOST_STRIDES(%(z)s)[0],
-                        CudaNdarray_HOST_STRIDES(%(z)s)[1]
-                    );
-              }
-                CNDA_THREAD_SYNC;
-                cudaError_t err = cudaGetLastError();
-                if( cudaSuccess != err)
-                {
-                    PyErr_Format(PyExc_RuntimeError,
-                                 "Cuda error: %%s: %%s.\\n",
-                                 "kSoftmaxWithBias_%(nodename)s",
-                                 cudaGetErrorString(err));
-                    %(fail)s;
-                }
-            }
-        }
-        assert(%(z)s);
-        """ % locals()
-    def c_support_code_apply(self, node, nodename):
-        ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
-                params=['int M', 'int N',
-                        'const float * x', 'const int sx0', 'const int sx1',
-                        'const float * b', 'const int sb0',
-                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
-                body=[
-                    "extern __shared__ float buf[]",
-                    "float * buf2 = buf + N",
-                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                    "     blockIDX += gridDim.x){",
-                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                         "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
-                         "buf[tx] += b[tx * sb0]",
-                         "buf2[tx] = buf[tx]",
-                      "}",
-                       "__syncthreads()",
-                       inline_softmax('N', 'buf', 'buf2',
-                                      'threadIdx.x', 'blockDim.x'),
-                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                         "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
-                      "}",
-                      "__syncthreads()",
-                    "}",
-                    ])
-        ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
-                           params=['int M', 'int N',
-                                   'const float * x',
-                                   'const int sx0', 'const int sx1',
-                                   'const float * b', 'const int sb0',
-                                   'float * sm',
-                                   'const int sm_s0', 'const int sm_s1'],
-                           body=[
-                               "extern __shared__ float buf[]",
-                               "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                               "     blockIDX += gridDim.x){",
-                               "const float *x_ptr = &x[blockIDX * sx0]",
-                               "float *sm_ptr = &sm[blockIDX * sm_s0]",
-                               inline_softmax_fixed_shared('N', 'buf',
-                                                           'x_ptr', 'sx1',
-                                                           'sm_ptr', 'sm_s1',
-                                                           'threadIdx.x',
-                                                           'blockDim.x',
-                                                           'b', 'sb0'),
-                               "__syncthreads()",
-                               "}",
-                           ])
-        return ret1 + "\n" + ret2
-gpu_softmax_with_bias = GpuSoftmaxWithBias()
--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
@@ -160,90 +160,3 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
        assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
                rtol, atol)
-def test_softmax_with_bias():
-    """
-    This is basic test for GpuSoftmaxWithBias
-    We check that we loop when their is too much block
-    TODO: check that we loop when their is too much thread.(THIS IS
-    NOT IMPLEMENTED)
-    """
-    x = T.fmatrix('x')
-    # We can't use zeros_like(x[0,::]) as this don't allow to test with
-    # 0 shape.
-    z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
-                                             dtype='float32')[::2])
-    f = theano.function([x], z, mode=mode_without_gpu)
-    f_gpu = theano.function([x], z, mode=mode_with_gpu)
-    assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax_with_bias
-    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
-                      cuda.nnet.GpuSoftmaxWithBias)
-    def cmp(n, m):
-        #print "test_softmax",n,m
-        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
-        out = f(data)
-        gout = f_gpu(data)
-        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-    cmp(2, 5)
-    #we need to test n>32*1024 to check that we make the block loop.
-    cmp(2 << 15, 5)
-    cmp(4074, 400)
-    cmp(0, 10)
-    cmp(784, 784)
-    cmp(4, 1000)
-    cmp(4, 1024)
-    cmp(4, 2000)
-    cmp(4, 2024)
-    #GTX285 don't have enough shared mem for this case.
-    cmp(4, 4074)
-    # The GTX580, 680 and kepler don't have enough shared memory.
-    cmp(2, 10000)
-    cmp(128, 16 * 1024)
-    cmp(128, 64 * 1024)
-def test_softmax():
-    """
-    This is basic test for GpuSoftmax
-    We check that we loop when their is too much block
-    We use slower code when there isn't enough shared memory
-    """
-    x = T.fmatrix('x')
-    z = T.nnet.softmax(x)
-    f = theano.function([x], z, mode=mode_without_gpu)
-    f_gpu = theano.function([x], z, mode=mode_with_gpu)
-    assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax
-    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
-                      cuda.nnet.GpuSoftmax)
-    def cmp(n, m):
-        #print "test_softmax",n,m
-        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
-        out = f(data)
-        gout = f_gpu(data)
-        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-    #we need to test n>32*1024 to check that we make the block loop.
-    cmp(2, 5)
-    cmp(2 << 15, 5)
-    cmp(4074, 400)
-    cmp(0, 10)
-    cmp(784, 784)
-    cmp(4, 1000)
-    cmp(4, 1024)
-    cmp(4, 2000)
-    cmp(4, 2024)
-    # The GTX285 don't have enough shared memory.
-    cmp(4, 4074)
-    # The GTX580, 680 and kepler don't have enough shared memory.
-    cmp(2, 10000)
-    cmp(128, 16 * 1024)
-    cmp(128, 64 * 1024)