提交 6ec28cf2 authored 作者: abergeron's avatar abergeron

Merge pull request #3743 from nouiz/reduce_101_newbackend

Reduce 101 newbackend
......@@ -714,7 +714,7 @@ class PureOp(object):
"own op, implement the R_op method." %
(self, self.__class__.__name__))
def perform(self, node, inputs, output_storage):
def perform(self, node, inputs, output_storage, params=None):
"""
Required: Calculate the function on the inputs and put the variables in
the output storage. Return None.
......@@ -746,7 +746,10 @@ class PureOp(object):
The subclass does not override this method.
"""
raise utils.MethodNotDefined("perform", type(self), self.__class__.__name__)
raise utils.MethodNotDefined(
"perform", type(self), self.__class__.__name__,
"Did you used Theano flags mode=FAST_COMPILE?"
" You can use optimizer=fast_compile instead.")
def do_constant_folding(self, node):
"""
......
......@@ -117,14 +117,6 @@ def test_careduce():
((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111
((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111
((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011
((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011
((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111
((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]), # 1111
# test pattern implemented by reshape
((4100, 4, 3, 2), [0]), ((4, 4100, 3, 2), [0]), ((4, 3, 4100, 2), [0]), ((4, 3, 2, 4100), [0]), # 1000
((4100, 4, 3, 2), [1]), ((4, 4100, 3, 2), [1]), ((4, 3, 4100, 2), [1]), ((4, 3, 2, 4100), [1]), # 0100
......@@ -132,6 +124,8 @@ def test_careduce():
((4100, 4, 3, 2), [3]), ((4, 4100, 3, 2), [3]), ((4, 3, 4100, 2), [3]), ((4, 3, 2, 4100), [3]), # 0001
# reduce over 2d
((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011
((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101
((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110
((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001
# ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
......@@ -141,9 +135,12 @@ def test_careduce():
# 3d not tested: 1101, 1110, 1111
((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101
((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011
((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111
((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111
# reduce over 4d
((4100, 4, 3, 2), [0]), ((4, 4100, 3, 2), [0]), ((4, 3, 4100, 2), [0]), ((4, 3, 2, 4100), [0]), # 1111
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]), # 1111
# reduce over 5d
((1100, 2, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 1100, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 1100, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 1100, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 5, 1100), [0, 1, 2, 3, 4]), # 11111
......
......@@ -3,6 +3,7 @@ import copy
from theano.compat import izip
import numpy
import theano
from theano import Apply, scalar, config
from theano import scalar as scal
from six.moves import StringIO, xrange
......@@ -654,7 +655,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return node.inputs[0].type.context
def perform(self, node, inp, out, ctx):
raise MethodNotDefined("")
theano.Op.perform(self, node, inp, out, ctx)
def supports_c_code(self, inputs):
"""
......@@ -833,7 +834,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return sio.getvalue()
def _makecall(self, node, name, x, z, fail, pattern=None):
def _makecall(self, node, name, x, z, fail, pattern=None, extra_dims=(), extra_strides=()):
"""
Return a string for making a kernel call.
......@@ -876,6 +877,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
for i in xrange(ndim):
params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
for declaration, value in extra_dims:
print(declaration % locals(), file=sio)
params.append(value)
params.append("(void *)%(x)s->ga.data" % locals())
params.append("(void *)&%(x)s->ga.offset" % locals())
for i in xrange(ndim):
......@@ -883,6 +887,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
""" % locals(), file=sio)
params.append("(void *)&stride_A%(i)d" % locals())
for declaration, value in extra_strides:
print(declaration % locals(), file=sio)
params.append(value)
params.append("(void *)%(z)s->ga.data" % locals())
params.append("(void *)&%(z)s->ga.offset" % locals())
......@@ -1779,6 +1786,34 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
}
""" % locals(), file=sio)
def c_code_reduce_101(self, sio, node, name, x, z, fail):
    """
    Print to `sio` the C snippet that sets up and launches the kernel
    for the reduction pattern 101.

    The kernel call is requested from `self._makecall` with
    pattern="1011". One extra dimension argument (a C variable `one`
    equal to 1) and one extra stride argument (a C variable `sone`
    equal to 1) are passed along through `extra_dims` and
    `extra_strides`.
    NOTE(review): presumably this lets the 3d input be handled by the
    4d "1011" kernel as if it had a trailing dimension of length 1 --
    confirm against the kernel definition.

    The generated C code picks the launch configuration as follows:

    - n_threads[1] is grown while the thread count stays <= 256, then
      capped by dimension 2 of `x`;
    - n_threads[2] is grown under the same 256 bound, then capped by
      64 and by dimension 0 of `x`;
    - n_blocks[0] is dimension 1 of `x`.

    Parameters
    ----------
    sio
        File-like object that receives the generated code.
    node, name, fail
        Forwarded unchanged to `self._makecall`.
    x
        C name of the input array, substituted into the template.
    z
        C name of the output array, forwarded to `self._makecall`.

    """
    kernel_call = self._makecall(
        node, name, x, z, fail,
        pattern="1011",
        extra_dims=[("size_t one = 1;", "(void *) &one")],
        extra_strides=[("ssize_t sone = 1;", "(void *) &sone")])
    # Only `x` and `makecall` are referenced by the template, so an
    # explicit mapping is enough for the substitution.
    substitutions = {"x": x, "makecall": kernel_call}
    template = """
{
int verbose = 0;
// size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
// (size_t) 256), 1, 1};
size_t n_threads[3] = {1, 1, 1};
while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
++n_threads[2];
if (n_threads[2] > 64)
n_threads[2] = 64;
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
%(makecall)s
}
"""
    print(template % substitutions, file=sio)
def c_code_reduce_111(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail)
print("""
......@@ -2572,7 +2607,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 1, 1):
if self.reduce_mask == (1, 0, 1, 1) or self.reduce_mask == (1, 0, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
......
......@@ -83,6 +83,7 @@ class test_GpuCAReduceCPY(test_elemwise.test_CAReduce):
op = GpuCAReduceCPY
reds = [scalar.add, scalar.mul]
pre_scalar_op = None
mode = mode_with_gpu
def test_perform(self):
for dtype in self.dtypes + self.bin_dtypes:
......@@ -172,16 +173,28 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001
((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110
((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011
# ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100, 4, 3), [0, 2]), ((5, 4100, 3), [0, 2]), ((5, 4, 4100), [0, 2]), # 101
((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111
((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111
# reduce over 2d
((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011
((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011
# ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110 by reshape
# ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]), # 1001 by reshape
# ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]), # 1010 not implemented
# ((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]), # 1100 by reshape
# reduce over 3d
# 3d not tested: 1101, 1110, 1111
# ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]), # 1101 by reshape
# ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110 by reshape
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]), # 1011
((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111
((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 2, 3), [0, 1, 2, 3]), # 1111
# reduce over 4d
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]), # 1111
# test pattern implemented by reshape
# Skip them, as this tests the op directly, not the optimization with reshape
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论