提交 184216ae authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #6496 from notoraptor/optimize-sum-squares-to-cudnn-2

Optimize SUM(X^2), SUM(ABS(X)) and MAX(ABS(X)) to use cuDNN reduction.
......@@ -2313,7 +2313,7 @@ class _RNNSplitParams(DnnBase):
assert(dims[2] == 1);
assert(dims[1] == 1);
%(b)s = pygpu_view(%(w)s, Py_None);
%(b)s->ga.offset = off;
%(b)s->ga.offset += off;
%(b)s->ga.dimensions[0] = dims[0];
GpuArray_fix_flags(&%(b)s->ga);
bshp = dims[0];
......@@ -2343,7 +2343,7 @@ class _RNNSplitParams(DnnBase):
assert(dims[2] == 1);
// We assume that the typecode matches
%(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
%(m)s->ga.offset = off;
%(m)s->ga.offset += off;
assert(dims[0] %% bshp == 0);
%(m)s->ga.dimensions[0] = dims[0] / bshp;
%(m)s->ga.dimensions[1] = bshp;
......@@ -2362,7 +2362,7 @@ class _RNNSplitParams(DnnBase):
return code
def c_code_cache_version(self):
return (3, version())
return (4, version())
def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
......@@ -3746,19 +3746,41 @@ def local_dnn_reduction(node):
node.op.acc_dtype == 'float64'):
return
def _identity(a):
return a
def _square(a):
return GpuElemwise(theano.scalar.basic.sqr)(a)
scal = node.op.scalar_op.name
post = _identity
if node.op.pre_scalar_op is not None:
# Might want to handle absmax, avg, norm1, norm2 here
return
# Might want to handle absmax, avg, and other cases for (norm1, norm2) here
if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
scal = 'norm2'
post = _square
elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
scal = 'norm1'
else:
return
elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and
isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)):
scal = 'absmax'
else:
return
if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
return
with inherit_stack_trace(node.outputs):
return (GpuDnnReduction(node.op.scalar_op.name,
node.op.axis,
node.op.acc_dtype,
node.op.dtype,
False)(node.inputs[0]),)
ret = GpuDnnReduction(scal,
node.op.axis,
node.op.acc_dtype,
node.op.dtype,
False)(node.inputs[0])
return [post(ret)]
@register_opt('cudnn')
......
......@@ -492,6 +492,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
__props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
'pre_scalar_op')
_f16_ok = True
verbose = 0
def __init__(self, scalar_op, axis=None,
reduce_mask=None, dtype=None, acc_dtype=None,
......@@ -927,12 +928,12 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = 0;
""" % locals()
def _assign_init(self, first_item):
def _assign_init(self, first_item, dtype):
"""
This return the initial value for myresult.
If the scalar op have an identity value, return it.
......@@ -949,7 +950,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
scalar.Minimum))
if self.pre_scalar_op: # TODO: multiple dtypes
# dtype = node.inputs[0].dtype
dtype = 'float32'
dummy_var = scalar.Scalar(dtype=dtype)()
......@@ -1093,6 +1093,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals()
def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
verbose = self.verbose
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0:
......@@ -1121,7 +1122,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if(PyGpuArray_SIZE(%(x)s)==0){
%(zero_shp)s;
}else{
int verbose = 0;
int verbose = %(verbose)s;
size_t numEls = PyGpuArray_SIZE(%(x)s);
size_t n_threads = std::min(numEls, (size_t) 256);
size_t n_blocks = 1;
......@@ -1142,10 +1143,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_1(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
......@@ -1153,10 +1155,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_11(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
......@@ -1181,6 +1184,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
assert N in [1, 2, 3]
verbose = self.verbose
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
makecall = self._makecall(node, name, x, z, fail)
......@@ -1222,7 +1226,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
%(threads_y)s
%(threads_z)s
......@@ -1241,6 +1245,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail):
verbose = self.verbose
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
......@@ -1256,7 +1261,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
if(PyGpuArray_STRIDES(%(x)s)[0]>
PyGpuArray_STRIDES(%(x)s)[1]){
// If there are a lot of summations to do, then we can use simple parallelization -
......@@ -1334,6 +1339,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_010(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
makecall_inner = self._makecall(node, name, x, z, fail,
pattern="010_inner")
......@@ -1400,7 +1406,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
}
else
{
int verbose = 2;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
while( (n_threads[0]*(n_threads[1]+1)<=256)
......@@ -1442,10 +1448,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_0101(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * n_threads[1] <= 256)
{
......@@ -1459,6 +1466,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_100(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
......@@ -1477,7 +1485,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# use blockIdx.y for i2
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
......@@ -1527,10 +1535,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_110(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[0]*n_threads[1] <= 256)
{
......@@ -1546,10 +1555,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_001(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
while (n_blocks[0] * n_blocks[1] <= 4096)
......@@ -1564,13 +1574,14 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_101(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail,
extra_dims=[("size_t one = 1;", "(void *) &one")],
extra_strides=[("ssize_t sone = 1;", "(void *) &sone")],
pattern="1011")
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
// size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
// (size_t) 256), 1, 1};
size_t n_threads[3] = {1, 1, 1};
......@@ -1592,10 +1603,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_111(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit
......@@ -1624,13 +1636,14 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_0011(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
......@@ -1653,10 +1666,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_1111(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit
......@@ -1686,10 +1700,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail)
print("""
{
int verbose = 0;
int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
......@@ -1708,7 +1723,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [21] # the version corresponding to the c code in this Op
version = [24, self.verbose] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(
......@@ -1729,6 +1744,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
assign_dtype = in_dtype
flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
......@@ -1744,7 +1760,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO()
......@@ -1758,9 +1774,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
......@@ -1783,7 +1799,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename
sio = StringIO()
......@@ -1798,9 +1814,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
......@@ -1824,7 +1840,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename
sio = StringIO()
......@@ -1839,9 +1855,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
......@@ -1911,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
for i in xrange(nd_in)])
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals())
reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals(), assign_dtype)
reduce_fct = self._assign_reduce(
node, nodename, "myresult",
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
......@@ -1948,7 +1964,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])")
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])", assign_dtype)
kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename
sio = StringIO()
......@@ -1995,7 +2011,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
{}, True)
reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])")
reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])", assign_dtype)
kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO()
......@@ -2010,9 +2026,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
%(acc_type)s myresult = 0;
X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = 0;
for (int a = blockIdx.x; a < A; a += gridDim.x)
{
......@@ -2062,7 +2078,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])", assign_dtype)
sio = StringIO()
print("""#include "cluda.h"
......@@ -2096,7 +2112,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])")
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])", assign_dtype)
kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename
sio = StringIO()
......@@ -2112,9 +2128,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
......@@ -2144,7 +2160,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])", assign_dtype)
sio = StringIO()
print("""#include "cluda.h"
......@@ -2175,7 +2191,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
sio = StringIO()
print("""#include "cluda.h"
......@@ -2206,7 +2222,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename
sio = StringIO()
......@@ -2257,7 +2273,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
sio = StringIO()
print("""#include "cluda.h"
......@@ -2294,7 +2310,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])", assign_dtype)
sio = StringIO()
print("""#include "cluda.h"
......@@ -2329,7 +2345,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
sio = StringIO()
print("""#include "cluda.h"
......@@ -2359,7 +2375,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])", assign_dtype)
kname = "kernel_reduce_1011"
k_var = "kernel_reduce_1011_" + nodename
sio = StringIO()
......@@ -2375,9 +2391,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
......
......@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
return False
x, = inputs
idtype = x.dtype
adtype = getattr(op, 'acc_dtype', None)
adtype = getattr(op, 'acc_dtype', idtype)
odtype = getattr(op, 'dtype', outputs[0].dtype)
# Force accumulator to float32 for float32 inputs since tree
......@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
def local_gpu_elemwise_careduce(node):
"""
Merge some GpuCAReduceCuda and GPUElemwise.
Currently merged:
- SUM(X^2)
- SUM(ABS(X))
"""
if (isinstance(node.op, GpuCAReduceCuda) and
......@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably results
# in slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
scalar.basic.Abs))):
inp = node.inputs[0].owner.inputs[0]
props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr
props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
with inherit_stack_trace(node.outputs):
out = GpuCAReduceCuda(**props)(inp)
return [out]
......
......@@ -1569,6 +1569,48 @@ def test_dnn_reduction_opt():
yield dnn_reduction, 2, idtype, adtype, odtype
def test_dnn_reduction_sum_squares():
    """Check that SUM(X^2) gets optimized into a cuDNN 'norm2' reduction.

    Requires cuDNN >= 6000 (first version with reduction support).
    """
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    x = T.matrix()
    for axis in (None, 0, 1):
        fn = theano.function([x], (x**2).sum(axis=axis), mode=mode_with_gpu)
        # The optimizer should have rewritten the sum-of-squares into a
        # single GpuDnnReduction node using the 'norm2' reduction op.
        uses_norm2 = any(
            isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'norm2'
            for node in fn.maker.fgraph.apply_nodes)
        assert uses_norm2
        data = np.random.random((4, 5)).astype(theano.config.floatX)
        utt.assert_allclose((data**2).sum(axis=axis), fn(data))
def test_dnn_reduction_sum_abs():
    """Check that SUM(ABS(X)) gets optimized into a cuDNN 'norm1' reduction.

    Requires cuDNN >= 6000 (first version with reduction support).
    """
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    x = T.matrix()
    for axis in (None, 0, 1):
        fn = theano.function([x], abs(x).sum(axis=axis), mode=mode_with_gpu)
        # The optimizer should have rewritten the sum-of-absolute-values
        # into a single GpuDnnReduction node using the 'norm1' reduction op.
        uses_norm1 = any(
            isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'norm1'
            for node in fn.maker.fgraph.apply_nodes)
        assert uses_norm1
        data = np.random.random((4, 5)).astype(theano.config.floatX)
        utt.assert_allclose(np.abs(data).sum(axis=axis), fn(data))
def test_dnn_reduction_absmax():
    """Check that MAX(ABS(X)) gets optimized into a cuDNN 'absmax' reduction.

    Requires cuDNN >= 6000 (first version with reduction support).
    """
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    x = T.matrix()
    for axis in (None, 0, 1):
        fn = theano.function([x], abs(x).max(axis=axis), mode=mode_with_gpu)
        # The optimizer should have rewritten max-of-absolute-values into a
        # single GpuDnnReduction node using the 'absmax' reduction op.
        uses_absmax = any(
            isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'absmax'
            for node in fn.maker.fgraph.apply_nodes)
        assert uses_absmax
        data = np.random.random((4, 5)).astype(theano.config.floatX)
        utt.assert_allclose(np.max(np.abs(data), axis=axis), fn(data))
def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp),
......
......@@ -360,23 +360,31 @@ def test_pdbbreakpoint_op():
def test_local_gpu_elemwise_careduce():
mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn')
x = theano.tensor.matrix()
o = (x * x).sum()
f = theano.function([x], o, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), (data * data).sum())
o = (x * x).sum(axis=1)
f = theano.function([x], o, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert _check_stack_trace(f)
utt.assert_allclose(f(data), (data * data).sum(axis=1))
def fn_sum_square(x, axis):
return (x * x).sum(axis=axis)
def fn_sum_abs(x, axis):
return abs(x).sum(axis=axis)
def fn_max_abs(x, axis):
return abs(x).max(axis=axis)
for fn, pre_scalar_op in ((fn_sum_square, theano.scalar.sqr),
(fn_sum_abs, theano.scalar.abs_),
(fn_max_abs, theano.scalar.abs_)):
for axis in (None, 0, 1):
o = fn(x, axis)
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == pre_scalar_op
assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(fn(data, axis), f(data))
def test_local_lift_dot22scalar():
......
......@@ -20,6 +20,7 @@ from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
Prod, ProdWithoutZeros)
from theano.tests import unittest_tools
from theano.tests.unittest_tools import attr
import theano.tests.unittest_tools as utt
def FunctionGraph(i, o):
......@@ -482,8 +483,7 @@ class test_CAReduce(unittest_tools.InferShapeTester):
try:
f_xv = f(xv)
self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv))
self.assertTrue(np.allclose(f_xv, zv),
(f_xv, zv, xsh, tosum))
utt.assert_allclose(zv, f_xv)
except NotImplementedError:
# GpuCAReduce don't implement all cases when size is 0
assert xv.size == 0
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论