Commit 3943db3c authored by Frederic

Make GpuCAReduce support more dtypes, but keep the acc_dtype the same as the input/output dtype.

Parent 26d91309
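
For reference, the whole change follows one template pattern: each code-generating method now derives a C type name from the output dtype (`dtype = "npy_" + node.outputs[0].dtype`) and substitutes it through `%(dtype)s` wherever the generated CUDA source previously hard-coded `float`. Below is a minimal, self-contained sketch of that pattern; the helper name and its arguments are illustrative, not Theano's API.

```python
# Hypothetical sketch of the substitution pattern used throughout this diff;
# render_ccontig_kernel and its arguments are made up for illustration.
def render_ccontig_kernel(nodename, out_dtype):
    # "float32" -> "npy_float32"; the generated CUDA source relies on numpy's
    # npy_* typedefs, so the same template serves float32, float64, etc.
    dtype = "npy_" + out_dtype
    return """
        static __global__ void kernel_reduce_ccontig_%(nodename)s(
            const unsigned int d0,
            const %(dtype)s *A,
            %(dtype)s *Z)
        {
            extern __shared__ %(dtype)s buf[];
            // reduction body elided
        }
    """ % locals()

print(render_ccontig_kernel("example_node", "float64"))
```

Because the accumulation dtype matches the input/output dtype, the shared-memory buffer and the `myresult` accumulator inside each kernel are declared with the same `%(dtype)s` as the data pointers.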
@@ -607,7 +607,6 @@ class GpuCAReduceCuda(HideC, CAReduce):
def make_node(self, x):
x = as_gpuarray_variable(x)
assert x.dtype == "float32"
ret = super(GpuCAReduceCuda, self).make_node(x)
self = copy.copy(self)
self.axis = ret.op.axis
@@ -693,7 +692,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
nd_in = node.inputs[0].type.ndim
nd_out = node.outputs[0].type.ndim
dtype = "npy_" + node.outputs[0].dtype
assert node.inputs[0].dtype == node.outputs[0].dtype
assert nd_in - nd_out == sum(self.reduce_mask)
sio = StringIO()
@@ -775,7 +775,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
# \begin bracket the reduction in a check that there is
# actually work to do
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(float))" % locals()
zero_shp = "cudaMemset((%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(dtype)s))" % locals()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
scalar_op = self.scalar_op
@@ -827,16 +827,16 @@ class GpuCAReduceCuda(HideC, CAReduce):
if (verbose)
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
int n_shared = sizeof(%(dtype)s) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/4,
PyGpuArray_STRIDES(%(x)s)[1]/4,
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/4
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(dtype)s),
(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(dtype)s)
);
[
if config.gpuarray.sync:
@@ -848,6 +848,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
%(fail)s;
}
"""
dtype = "npy_" + node.outputs[0].dtype
sio = StringIO()
if pattern is None:
pattern = ''.join(str(c) for c in self.reduce_mask)
@@ -860,7 +861,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
print >> sio, """
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
int n_shared = sizeof(%(dtype)s) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
@@ -876,18 +877,18 @@ class GpuCAReduceCuda(HideC, CAReduce):
PyGpuArray_DIMS(%(x)s)[%(i)s],
""" % locals()
print >> sio, """
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
""" % locals()
for i in xrange(ndim):
print >> sio, """
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/4
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(dtype)s)
""" % locals()
print >> sio, """
,(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
,(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
""" % locals()
for i in xrange(nd_out):
print >> sio, """
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/4
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(dtype)s)
""" % locals()
sync = ""
if config.gpuarray.sync:
@@ -927,17 +928,18 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d0,
const int d1,
const int d2,
const float *A,
const %(dtype)s *A,
const int sA0,
const int sA1,
const int sA2,
float * Z,
%(dtype)s * Z,
const int sZ0)
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
dtype = "npy_" + node.outputs[0].dtype
if reduce_mask is None:
reduce_mask = self.reduce_mask
if ndim is None:
@@ -954,14 +956,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d%(i)s,
""" % locals()
print >> sio, """
const float *A,
const %(dtype)s *A,
""" % locals()
for i in xrange(ndim):
print >> sio, """
const int sA%(i)s,
""" % locals()
print >> sio, """
float * Z
%(dtype)s * Z
""" % locals()
for i in xrange(ndim - sum(reduce_mask)):
print >> sio, """
@@ -970,13 +972,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
print >> sio, ")"
return sio.getvalue()
def _k_init(self, *args):
def _k_init(self, node, nodename):
dtype = "npy_" + node.outputs[0].dtype
return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = 0.0f;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
@@ -986,7 +989,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
return;
}
"""
""" % locals()
def _assign_init(self, first_item):
"""
@@ -1037,6 +1040,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
node, name, sub: these should be passed through from the original
call to c_code
"""
dtype = "npy_" + node.outputs[0].dtype
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
@@ -1068,7 +1072,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
float temp = buf[threadNum + halfPoint];
%(dtype)s temp = buf[threadNum + halfPoint];
"""
new_version += self._assign_reduce(node, name,
@@ -1175,8 +1179,9 @@ class GpuCAReduceCuda(HideC, CAReduce):
is for the case where we are reducing on all axes and x is
C contiguous.
"""
dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(float))" % locals()
zero_shp = "cudaMemset((%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(dtype)s))" % locals()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
zero_shp = """
@@ -1185,6 +1190,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
%(fail)s;
""" % locals()
dtype = "npy_" + node.outputs[0].dtype
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1202,11 +1208,11 @@ class GpuCAReduceCuda(HideC, CAReduce):
" n_threads.x=%%d, size=%%d, ndim=%%d\\n",
n_threads.x,PyGpuArray_SIZE(%(x)s),
PyGpuArray_NDIM(%(x)s));
int n_shared = sizeof(float) * n_threads.x;
int n_shared = sizeof(%(dtype)s) * n_threads.x;
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_SIZE(%(x)s),
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
@@ -1265,12 +1271,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
"""
assert N in [1, 2, 3]
dtype = "npy_" + node.outputs[0].dtype
makecall = self._makecall(node, name, x, z, fail)
N_pattern = ''.join(['1'] * N)
param_dim = ",".join(["PyGpuArray_DIMS(%s)[%d]" % (x, i)
for i in xrange(N + 1)])
strides_dim = ",".join(["PyGpuArray_STRIDES(%s)[%d]/4"
% (x, i) for i in xrange(N + 1)])
strides_dim = ",".join(["PyGpuArray_STRIDES(%s)[%d]/sizeof(%s)"
% (x, i, dtype) for i in xrange(N + 1)])
threads_y = """
//get as many y threads as we can fit
@@ -1326,6 +1333,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail):
dtype = "npy_" + node.outputs[0].dtype
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1345,18 +1353,18 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_blocks.y);
}
assert( PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x;
int n_shared = sizeof(%(dtype)s) * n_threads.x;
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
1,
PyGpuArray_STRIDES(%(x)s)[0]/4,
PyGpuArray_STRIDES(%(x)s)[1]/4,
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(dtype)s),
(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
1,
PyGpuArray_STRIDES(%(z)s)[0]/4
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(dtype)s)
);
%(sync)s
cudaError_t sts = cudaGetLastError();
@@ -1382,6 +1390,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
makecall_inner = self._makecall(node, name, x, z, fail,
pattern="010_inner")
pattern = ''.join(str(i) for i in self.reduce_mask)
dtype = "npy_" + node.outputs[0].dtype
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1421,13 +1430,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/4,
PyGpuArray_STRIDES(%(x)s)[1]/4,
PyGpuArray_STRIDES(%(x)s)[2]/4,
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/4,
PyGpuArray_STRIDES(%(z)s)[1]/4
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(dtype)s),
(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(dtype)s)
);
%(sync)s
cudaError_t sts = cudaGetLastError();
@@ -1464,10 +1473,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
(size_t)n_threads.x),
(size_t)(4096 / n_blocks.x)
);
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/4,
PyGpuArray_STRIDES(%(x)s)[1]/4),
PyGpuArray_STRIDES(%(x)s)[2]/4)
==PyGpuArray_STRIDES(%(x)s)[2]/4
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(dtype)s)),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(dtype)s))
==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(dtype)s)
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x)){
if(verbose>1)
@@ -1623,6 +1632,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
def c_code_reduce_0011(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail)
dtype = "npy_" + node.outputs[0].dtype
print >> sio, """
{
int verbose = 0;
@@ -1642,7 +1652,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
(size_t) 256));
while (n_threads.x * n_threads.y <= 256
&& n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads.x * n_threads.y * sizeof(float) <=(15*1024-200))
&& n_threads.x * n_threads.y * sizeof(%(dtype)s) <=(15*1024-200))
{
n_threads.y += 1;
}
@@ -1711,7 +1721,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
""" % locals()
def c_code_cache_version_apply(self, node):
version = [9] # the version corresponding to the c code in this Op
version = [10] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
@@ -1728,6 +1738,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
def c_support_code_apply(self, node, nodename):
sio = StringIO()
nd_in = len(self.reduce_mask)
dtype = "npy_" + node.outputs[0].dtype
if all(i == 1 for i in self.reduce_mask):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
@@ -1739,13 +1750,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
print >> sio, """
static __global__ void kernel_reduce_ccontig_%(nodename)s(
const unsigned int d0,
const float *A,
float * Z)
const %(dtype)s *A,
%(dtype)s * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
@@ -1770,13 +1781,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
print >> sio, """
static __global__ void kernel_reduce_1_%(nodename)s(
const unsigned int d0,
const float *A, const int sA0,
float * Z)
const %(dtype)s *A, const int sA0,
%(dtype)s * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
@@ -1803,13 +1814,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
static __global__ void kernel_reduce_11_%(nodename)s(
const int d0,
const int d1,
const float *A, const int sA0, const int sA1,
float * Z)
const %(dtype)s *A, const int sA0, const int sA1,
%(dtype)s * Z)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
@@ -1915,13 +1926,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const %(dtype)s *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
%(dtype)s * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
extern __shared__ %(dtype)s buf[];
if (warpSize != 32)
{
@@ -1933,7 +1944,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float myresult = %(reduce_init)s;
%(dtype)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)s;
@@ -1956,13 +1967,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int C,
const int D,
//const int E, // THIS is 32
const float *X, const int sX0,
const %(dtype)s *X, const int sX0,
const int sX1, const int sX2,
float * Z, const int sZ0, const int sZ1)
%(dtype)s * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
float myresult = 0.0f;
%(dtype)s myresult = 0.0f;
if (warpSize != 32)
{
@@ -2050,14 +2061,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const %(dtype)s *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0)
%(dtype)s * Z, const int sZ0)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
@@ -2145,13 +2156,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const %(dtype)s *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
%(dtype)s * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
extern __shared__ %(dtype)s buf[];
if (warpSize != 32)
{
@@ -2162,7 +2173,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float myresult = %(reduce_init)s;
%(dtype)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)s;
@@ -2192,7 +2203,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float myresult = %(reduce_init)s;
%(dtype)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
@@ -2225,7 +2236,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float myresult = %(reduce_init)s;
%(dtype)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
@@ -2279,14 +2290,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
const unsigned int d1,
const unsigned int d2,
const unsigned int d3,
const float *A, const int sA0, const int sA1,
const %(dtype)s *A, const int sA0, const int sA1,
const int sA2, const int sA3,
float * Z, const int sZ0)
%(dtype)s * Z, const int sZ0)
{
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
......