提交 184216ae authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #6496 from notoraptor/optimize-sum-squares-to-cudnn-2

Optimize SUM(x^2), SUM(ABS(X)) and MAX(ABS(X)) to cuDNN reduction.
...@@ -2313,7 +2313,7 @@ class _RNNSplitParams(DnnBase): ...@@ -2313,7 +2313,7 @@ class _RNNSplitParams(DnnBase):
assert(dims[2] == 1); assert(dims[2] == 1);
assert(dims[1] == 1); assert(dims[1] == 1);
%(b)s = pygpu_view(%(w)s, Py_None); %(b)s = pygpu_view(%(w)s, Py_None);
%(b)s->ga.offset = off; %(b)s->ga.offset += off;
%(b)s->ga.dimensions[0] = dims[0]; %(b)s->ga.dimensions[0] = dims[0];
GpuArray_fix_flags(&%(b)s->ga); GpuArray_fix_flags(&%(b)s->ga);
bshp = dims[0]; bshp = dims[0];
...@@ -2343,7 +2343,7 @@ class _RNNSplitParams(DnnBase): ...@@ -2343,7 +2343,7 @@ class _RNNSplitParams(DnnBase):
assert(dims[2] == 1); assert(dims[2] == 1);
// We assume that the typecode matches // We assume that the typecode matches
%(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1); %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
%(m)s->ga.offset = off; %(m)s->ga.offset += off;
assert(dims[0] %% bshp == 0); assert(dims[0] %% bshp == 0);
%(m)s->ga.dimensions[0] = dims[0] / bshp; %(m)s->ga.dimensions[0] = dims[0] / bshp;
%(m)s->ga.dimensions[1] = bshp; %(m)s->ga.dimensions[1] = bshp;
...@@ -2362,7 +2362,7 @@ class _RNNSplitParams(DnnBase): ...@@ -2362,7 +2362,7 @@ class _RNNSplitParams(DnnBase):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (3, version()) return (4, version())
def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode): def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
...@@ -3746,19 +3746,41 @@ def local_dnn_reduction(node): ...@@ -3746,19 +3746,41 @@ def local_dnn_reduction(node):
node.op.acc_dtype == 'float64'): node.op.acc_dtype == 'float64'):
return return
def _identity(a):
return a
def _square(a):
return GpuElemwise(theano.scalar.basic.sqr)(a)
scal = node.op.scalar_op.name
post = _identity
if node.op.pre_scalar_op is not None: if node.op.pre_scalar_op is not None:
# Might want to handle absmax, avg, norm1, norm2 here # Might want to handle absmax, avg, and other cases for (norm1, norm2) here
if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
scal = 'norm2'
post = _square
elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
scal = 'norm1'
else:
return
elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and
isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)):
scal = 'absmax'
else:
return return
if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name): if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
return return
with inherit_stack_trace(node.outputs): with inherit_stack_trace(node.outputs):
return (GpuDnnReduction(node.op.scalar_op.name, ret = GpuDnnReduction(scal,
node.op.axis, node.op.axis,
node.op.acc_dtype, node.op.acc_dtype,
node.op.dtype, node.op.dtype,
False)(node.inputs[0]),) False)(node.inputs[0])
return [post(ret)]
@register_opt('cudnn') @register_opt('cudnn')
......
...@@ -492,6 +492,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -492,6 +492,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
__props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op', __props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
'pre_scalar_op') 'pre_scalar_op')
_f16_ok = True _f16_ok = True
verbose = 0
def __init__(self, scalar_op, axis=None, def __init__(self, scalar_op, axis=None,
reduce_mask=None, dtype=None, acc_dtype=None, reduce_mask=None, dtype=None, acc_dtype=None,
...@@ -927,12 +928,12 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -927,12 +928,12 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadNum = threadIdx.z * blockDim.x * blockDim.y const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x; + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_type)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = 0;
""" % locals() """ % locals()
def _assign_init(self, first_item): def _assign_init(self, first_item, dtype):
""" """
This return the initial value for myresult. This return the initial value for myresult.
If the scalar op have an identity value, return it. If the scalar op have an identity value, return it.
...@@ -949,7 +950,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -949,7 +950,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
scalar.Minimum)) scalar.Minimum))
if self.pre_scalar_op: # TODO: multiple dtypes if self.pre_scalar_op: # TODO: multiple dtypes
# dtype = node.inputs[0].dtype # dtype = node.inputs[0].dtype
dtype = 'float32'
dummy_var = scalar.Scalar(dtype=dtype)() dummy_var = scalar.Scalar(dtype=dtype)()
...@@ -1093,6 +1093,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1093,6 +1093,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals() """ % locals()
def c_code_reduce_ccontig(self, sio, node, name, x, z, fail): def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
verbose = self.verbose
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0: if getattr(self.scalar_op, 'identity', None) == 0:
...@@ -1121,7 +1122,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1121,7 +1122,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if(PyGpuArray_SIZE(%(x)s)==0){ if(PyGpuArray_SIZE(%(x)s)==0){
%(zero_shp)s; %(zero_shp)s;
}else{ }else{
int verbose = 0; int verbose = %(verbose)s;
size_t numEls = PyGpuArray_SIZE(%(x)s); size_t numEls = PyGpuArray_SIZE(%(x)s);
size_t n_threads = std::min(numEls, (size_t) 256); size_t n_threads = std::min(numEls, (size_t) 256);
size_t n_blocks = 1; size_t n_blocks = 1;
...@@ -1142,10 +1143,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1142,10 +1143,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_1(self, sio, node, name, x, z, fail): def c_code_reduce_1(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {1, 1, 1}; size_t n_blocks[3] = {1, 1, 1};
%(makecall)s %(makecall)s
...@@ -1153,10 +1155,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1153,10 +1155,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_11(self, sio, node, name, x, z, fail): def c_code_reduce_11(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1]; while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
...@@ -1181,6 +1184,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1181,6 +1184,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" """
assert N in [1, 2, 3] assert N in [1, 2, 3]
verbose = self.verbose
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
...@@ -1222,7 +1226,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1222,7 +1226,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
%(threads_y)s %(threads_y)s
%(threads_z)s %(threads_z)s
...@@ -1241,6 +1245,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1241,6 +1245,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 3) self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail): def c_code_reduce_10(self, sio, node, name, x, z, fail):
verbose = self.verbose
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
...@@ -1256,7 +1261,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1256,7 +1261,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
if(PyGpuArray_STRIDES(%(x)s)[0]> if(PyGpuArray_STRIDES(%(x)s)[0]>
PyGpuArray_STRIDES(%(x)s)[1]){ PyGpuArray_STRIDES(%(x)s)[1]){
// If there are a lot of summations to do, then we can use simple parallelization - // If there are a lot of summations to do, then we can use simple parallelization -
...@@ -1334,6 +1339,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1334,6 +1339,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_010(self, sio, node, name, x, z, fail): def c_code_reduce_010(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
makecall_inner = self._makecall(node, name, x, z, fail, makecall_inner = self._makecall(node, name, x, z, fail,
pattern="010_inner") pattern="010_inner")
...@@ -1400,7 +1406,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1400,7 +1406,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
} }
else else
{ {
int verbose = 2; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1}; size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
while( (n_threads[0]*(n_threads[1]+1)<=256) while( (n_threads[0]*(n_threads[1]+1)<=256)
...@@ -1442,10 +1448,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1442,10 +1448,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_0101(self, sio, node, name, x, z, fail): def c_code_reduce_0101(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * n_threads[1] <= 256) while (n_threads[0] * n_threads[1] <= 256)
{ {
...@@ -1459,6 +1466,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1459,6 +1466,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_100(self, sio, node, name, x, z, fail): def c_code_reduce_100(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
...@@ -1477,7 +1485,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1477,7 +1485,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# use blockIdx.y for i2 # use blockIdx.y for i2
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){ if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
...@@ -1527,10 +1535,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1527,10 +1535,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_110(self, sio, node, name, x, z, fail): def c_code_reduce_110(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[0]*n_threads[1] <= 256) while (n_threads[0]*n_threads[1] <= 256)
{ {
...@@ -1546,10 +1555,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1546,10 +1555,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_001(self, sio, node, name, x, z, fail): def c_code_reduce_001(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
while (n_blocks[0] * n_blocks[1] <= 4096) while (n_blocks[0] * n_blocks[1] <= 4096)
...@@ -1564,13 +1574,14 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1564,13 +1574,14 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_101(self, sio, node, name, x, z, fail): def c_code_reduce_101(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail, makecall = self._makecall(node, name, x, z, fail,
extra_dims=[("size_t one = 1;", "(void *) &one")], extra_dims=[("size_t one = 1;", "(void *) &one")],
extra_strides=[("ssize_t sone = 1;", "(void *) &sone")], extra_strides=[("ssize_t sone = 1;", "(void *) &sone")],
pattern="1011") pattern="1011")
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
// size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], // size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
// (size_t) 256), 1, 1}; // (size_t) 256), 1, 1};
size_t n_threads[3] = {1, 1, 1}; size_t n_threads[3] = {1, 1, 1};
...@@ -1592,10 +1603,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1592,10 +1603,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_111(self, sio, node, name, x, z, fail): def c_code_reduce_111(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit //get as many y threads as we can fit
...@@ -1624,13 +1636,14 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1624,13 +1636,14 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_0011(self, sio, node, name, x, z, fail): def c_code_reduce_0011(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
...@@ -1653,10 +1666,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1653,10 +1666,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_1111(self, sio, node, name, x, z, fail): def c_code_reduce_1111(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit //get as many y threads as we can fit
...@@ -1686,10 +1700,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1686,10 +1700,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_1011(self, sio, node, name, x, z, fail): def c_code_reduce_1011(self, sio, node, name, x, z, fail):
verbose = self.verbose
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
{ {
int verbose = 0; int verbose = %(verbose)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1]; while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
...@@ -1708,7 +1723,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1708,7 +1723,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [21] # the version corresponding to the c code in this Op version = [24, self.verbose] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply( scalar_node = Apply(
...@@ -1729,6 +1744,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1729,6 +1744,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype = node.inputs[0].dtype in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype) acc_dtype = self._acc_dtype(node.inputs[0].dtype)
assign_dtype = in_dtype
flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype) flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype) in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype) out_type = gpuarray.dtype_to_ctype(out_dtype)
...@@ -1744,7 +1760,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1744,7 +1760,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0])", load_in + "(A[i0])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
kname = "kernel_reduce_ccontig" kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO() sio = StringIO()
...@@ -1758,9 +1774,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1758,9 +1774,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_type)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{ {
...@@ -1783,7 +1799,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1783,7 +1799,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0])", load_in + "(A[i0 * sA0])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
kname = "kernel_reduce_1" kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename k_var = "kernel_reduce_1_" + nodename
sio = StringIO() sio = StringIO()
...@@ -1798,9 +1814,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1798,9 +1814,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_type)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{ {
...@@ -1824,7 +1840,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1824,7 +1840,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1])", load_in + "(A[i0 * sA0 + i1 * sA1])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
kname = "kernel_reduce_11" kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename k_var = "kernel_reduce_11_" + nodename
sio = StringIO() sio = StringIO()
...@@ -1839,9 +1855,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1839,9 +1855,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x * blockDim.y; const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x; const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ %(acc_type)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{ {
...@@ -1911,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1911,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
for i in xrange(nd_in)]) for i in xrange(nd_in)])
decl, kname, params, k_var = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals()) reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals(), assign_dtype)
reduce_fct = self._assign_reduce( reduce_fct = self._assign_reduce(
node, nodename, "myresult", node, nodename, "myresult",
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])", load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
...@@ -1948,7 +1964,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1948,7 +1964,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])", assign_dtype)
kname = "kernel_reduce_010" kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename k_var = "kernel_reduce_010_" + nodename
sio = StringIO() sio = StringIO()
...@@ -1995,7 +2011,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1995,7 +2011,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(X[a * sX0 + b * sX1 + c * sX2])", load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])") reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])", assign_dtype)
kname = "kernel_reduce_010_AD" kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO() sio = StringIO()
...@@ -2010,9 +2026,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2010,9 +2026,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
%(acc_type)s myresult = 0;
X = (const %(in_type)s *)(((char *)X)+offset_X); X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = 0;
for (int a = blockIdx.x; a < A; a += gridDim.x) for (int a = blockIdx.x; a < A; a += gridDim.x)
{ {
...@@ -2062,7 +2078,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2062,7 +2078,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])", assign_dtype)
sio = StringIO() sio = StringIO()
print("""#include "cluda.h" print("""#include "cluda.h"
...@@ -2096,7 +2112,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2096,7 +2112,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])") reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])", assign_dtype)
kname = "kernel_reduce_110" kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename k_var = "kernel_reduce_110_" + nodename
sio = StringIO() sio = StringIO()
...@@ -2112,9 +2128,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2112,9 +2128,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x * blockDim.y; const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x; const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_type)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{ {
...@@ -2144,7 +2160,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2144,7 +2160,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])", assign_dtype)
sio = StringIO() sio = StringIO()
print("""#include "cluda.h" print("""#include "cluda.h"
...@@ -2175,7 +2191,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2175,7 +2191,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
sio = StringIO() sio = StringIO()
print("""#include "cluda.h" print("""#include "cluda.h"
...@@ -2206,7 +2222,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2206,7 +2222,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
kname = "kernel_reduce_001" kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename k_var = "kernel_reduce_001_" + nodename
sio = StringIO() sio = StringIO()
...@@ -2257,7 +2273,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2257,7 +2273,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
sio = StringIO() sio = StringIO()
print("""#include "cluda.h" print("""#include "cluda.h"
...@@ -2294,7 +2310,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2294,7 +2310,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])", assign_dtype)
sio = StringIO() sio = StringIO()
print("""#include "cluda.h" print("""#include "cluda.h"
...@@ -2329,7 +2345,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2329,7 +2345,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
sio = StringIO() sio = StringIO()
print("""#include "cluda.h" print("""#include "cluda.h"
...@@ -2359,7 +2375,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2359,7 +2375,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])") reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])", assign_dtype)
kname = "kernel_reduce_1011" kname = "kernel_reduce_1011"
k_var = "kernel_reduce_1011_" + nodename k_var = "kernel_reduce_1011_" + nodename
sio = StringIO() sio = StringIO()
...@@ -2375,9 +2391,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2375,9 +2391,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_type)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
%(acc_type)s myresult = %(reduce_init)s;
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z) for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{ {
......
...@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
return False return False
x, = inputs x, = inputs
idtype = x.dtype idtype = x.dtype
adtype = getattr(op, 'acc_dtype', None) adtype = getattr(op, 'acc_dtype', idtype)
odtype = getattr(op, 'dtype', outputs[0].dtype) odtype = getattr(op, 'dtype', outputs[0].dtype)
# Force accumulator to float32 for float32 inputs since tree # Force accumulator to float32 for float32 inputs since tree
...@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs): ...@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
def local_gpu_elemwise_careduce(node): def local_gpu_elemwise_careduce(node):
""" """
Merge some GpuCAReduceCuda and GPUElemwise. Merge some GpuCAReduceCuda and GPUElemwise.
Currently merged:
- SUM(X^2)
- SUM(ABS(X))
""" """
if (isinstance(node.op, GpuCAReduceCuda) and if (isinstance(node.op, GpuCAReduceCuda) and
...@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node): ...@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably results # operation with some reduction pattern will probably results
# in slow down. # in slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
scalar.basic.Abs))):
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
props = node.op._props_dict() props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
with inherit_stack_trace(node.outputs): with inherit_stack_trace(node.outputs):
out = GpuCAReduceCuda(**props)(inp) out = GpuCAReduceCuda(**props)(inp)
return [out] return [out]
......
...@@ -1569,6 +1569,48 @@ def test_dnn_reduction_opt(): ...@@ -1569,6 +1569,48 @@ def test_dnn_reduction_opt():
yield dnn_reduction, 2, idtype, adtype, odtype yield dnn_reduction, 2, idtype, adtype, odtype
def test_dnn_reduction_sum_squares():
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
raise SkipTest(dnn.dnn_available.msg)
M = T.matrix()
for axis in (None, 0, 1):
out = (M**2).sum(axis=axis)
f = theano.function([M], out, mode=mode_with_gpu)
assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'norm2'
for node in f.maker.fgraph.apply_nodes)
M_val = np.random.random((4, 5)).astype(theano.config.floatX)
utt.assert_allclose((M_val**2).sum(axis=axis), f(M_val))
def test_dnn_reduction_sum_abs():
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
raise SkipTest(dnn.dnn_available.msg)
M = T.matrix()
for axis in (None, 0, 1):
out = abs(M).sum(axis=axis)
f = theano.function([M], out, mode=mode_with_gpu)
assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'norm1'
for node in f.maker.fgraph.apply_nodes)
M_val = np.random.random((4, 5)).astype(theano.config.floatX)
utt.assert_allclose(np.abs(M_val).sum(axis=axis), f(M_val))
def test_dnn_reduction_absmax():
if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
raise SkipTest(dnn.dnn_available.msg)
M = T.matrix()
for axis in (None, 0, 1):
out = abs(M).max(axis=axis)
f = theano.function([M], out, mode=mode_with_gpu)
assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'absmax'
for node in f.maker.fgraph.apply_nodes)
M_val = np.random.random((4, 5)).astype(theano.config.floatX)
utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))
def dnn_reduction_strides(shp, shuffle, slice): def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed() utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp), inp = GpuArrayType('float32', (False,) * len(shp),
......
...@@ -360,23 +360,31 @@ def test_pdbbreakpoint_op(): ...@@ -360,23 +360,31 @@ def test_pdbbreakpoint_op():
def test_local_gpu_elemwise_careduce(): def test_local_gpu_elemwise_careduce():
mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn')
x = theano.tensor.matrix() x = theano.tensor.matrix()
o = (x * x).sum()
f = theano.function([x], o, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), (data * data).sum())
o = (x * x).sum(axis=1) def fn_sum_square(x, axis):
f = theano.function([x], o, mode=mode_with_gpu) return (x * x).sum(axis=axis)
def fn_sum_abs(x, axis):
return abs(x).sum(axis=axis)
def fn_max_abs(x, axis):
return abs(x).max(axis=axis)
for fn, pre_scalar_op in ((fn_sum_square, theano.scalar.sqr),
(fn_sum_abs, theano.scalar.abs_),
(fn_max_abs, theano.scalar.abs_)):
for axis in (None, 0, 1):
o = fn(x, axis)
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 3 assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == pre_scalar_op
assert _check_stack_trace(f) assert _check_stack_trace(f)
utt.assert_allclose(f(data), (data * data).sum(axis=1)) data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(fn(data, axis), f(data))
def test_local_lift_dot22scalar(): def test_local_lift_dot22scalar():
......
...@@ -20,6 +20,7 @@ from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle, ...@@ -20,6 +20,7 @@ from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
Prod, ProdWithoutZeros) Prod, ProdWithoutZeros)
from theano.tests import unittest_tools from theano.tests import unittest_tools
from theano.tests.unittest_tools import attr from theano.tests.unittest_tools import attr
import theano.tests.unittest_tools as utt
def FunctionGraph(i, o): def FunctionGraph(i, o):
...@@ -482,8 +483,7 @@ class test_CAReduce(unittest_tools.InferShapeTester): ...@@ -482,8 +483,7 @@ class test_CAReduce(unittest_tools.InferShapeTester):
try: try:
f_xv = f(xv) f_xv = f(xv)
self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv)) self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv))
self.assertTrue(np.allclose(f_xv, zv), utt.assert_allclose(zv, f_xv)
(f_xv, zv, xsh, tosum))
except NotImplementedError: except NotImplementedError:
# GpuCAReduce don't implement all cases when size is 0 # GpuCAReduce don't implement all cases when size is 0
assert xv.size == 0 assert xv.size == 0
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论