提交 6ec28cf2 authored 作者: abergeron's avatar abergeron

Merge pull request #3743 from nouiz/reduce_101_newbackend

Reduce 101 newbackend
...@@ -714,7 +714,7 @@ class PureOp(object): ...@@ -714,7 +714,7 @@ class PureOp(object):
"own op, implement the R_op method." % "own op, implement the R_op method." %
(self, self.__class__.__name__)) (self, self.__class__.__name__))
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage, params=None):
""" """
Required: Calculate the function on the inputs and put the variables in Required: Calculate the function on the inputs and put the variables in
the output storage. Return None. the output storage. Return None.
...@@ -746,7 +746,10 @@ class PureOp(object): ...@@ -746,7 +746,10 @@ class PureOp(object):
The subclass does not override this method. The subclass does not override this method.
""" """
raise utils.MethodNotDefined("perform", type(self), self.__class__.__name__) raise utils.MethodNotDefined(
"perform", type(self), self.__class__.__name__,
"Did you used Theano flags mode=FAST_COMPILE?"
" You can use optimizer=fast_compile instead.")
def do_constant_folding(self, node): def do_constant_folding(self, node):
""" """
......
...@@ -117,14 +117,6 @@ def test_careduce(): ...@@ -117,14 +117,6 @@ def test_careduce():
((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111 ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111
((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111 ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111
((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011
((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011
((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111
((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]), # 1111
# test pattern implemented by reshape # test pattern implemented by reshape
((4100, 4, 3, 2), [0]), ((4, 4100, 3, 2), [0]), ((4, 3, 4100, 2), [0]), ((4, 3, 2, 4100), [0]), # 1000 ((4100, 4, 3, 2), [0]), ((4, 4100, 3, 2), [0]), ((4, 3, 4100, 2), [0]), ((4, 3, 2, 4100), [0]), # 1000
((4100, 4, 3, 2), [1]), ((4, 4100, 3, 2), [1]), ((4, 3, 4100, 2), [1]), ((4, 3, 2, 4100), [1]), # 0100 ((4100, 4, 3, 2), [1]), ((4, 4100, 3, 2), [1]), ((4, 3, 4100, 2), [1]), ((4, 3, 2, 4100), [1]), # 0100
...@@ -132,6 +124,8 @@ def test_careduce(): ...@@ -132,6 +124,8 @@ def test_careduce():
((4100, 4, 3, 2), [3]), ((4, 4100, 3, 2), [3]), ((4, 3, 4100, 2), [3]), ((4, 3, 2, 4100), [3]), # 0001 ((4100, 4, 3, 2), [3]), ((4, 4100, 3, 2), [3]), ((4, 3, 4100, 2), [3]), ((4, 3, 2, 4100), [3]), # 0001
# reduce over 2d # reduce over 2d
((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011
((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101
((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110 ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110
((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001 ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001
# ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented # ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
...@@ -141,9 +135,12 @@ def test_careduce(): ...@@ -141,9 +135,12 @@ def test_careduce():
# 3d not tested: 1101, 1110, 1111 # 3d not tested: 1101, 1110, 1111
((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101 ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101
((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110 ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011
((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111
((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111
# reduce over 4d # reduce over 4d
((4100, 4, 3, 2), [0]), ((4, 4100, 3, 2), [0]), ((4, 3, 4100, 2), [0]), ((4, 3, 2, 4100), [0]), # 1111 ((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]), # 1111
# reduce over 5d # reduce over 5d
((1100, 2, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 1100, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 1100, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 1100, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 5, 1100), [0, 1, 2, 3, 4]), # 11111 ((1100, 2, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 1100, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 1100, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 1100, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 5, 1100), [0, 1, 2, 3, 4]), # 11111
......
...@@ -3,6 +3,7 @@ import copy ...@@ -3,6 +3,7 @@ import copy
from theano.compat import izip from theano.compat import izip
import numpy import numpy
import theano
from theano import Apply, scalar, config from theano import Apply, scalar, config
from theano import scalar as scal from theano import scalar as scal
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
...@@ -654,7 +655,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -654,7 +655,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return node.inputs[0].type.context return node.inputs[0].type.context
def perform(self, node, inp, out, ctx): def perform(self, node, inp, out, ctx):
raise MethodNotDefined("") theano.Op.perform(self, node, inp, out, ctx)
def supports_c_code(self, inputs): def supports_c_code(self, inputs):
""" """
...@@ -833,7 +834,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -833,7 +834,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return sio.getvalue() return sio.getvalue()
def _makecall(self, node, name, x, z, fail, pattern=None): def _makecall(self, node, name, x, z, fail, pattern=None, extra_dims=(), extra_strides=()):
""" """
Return a string for making a kernel call. Return a string for making a kernel call.
...@@ -876,6 +877,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -876,6 +877,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
for i in xrange(ndim): for i in xrange(ndim):
params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals()) params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
for declaration, value in extra_dims:
print(declaration % locals(), file=sio)
params.append(value)
params.append("(void *)%(x)s->ga.data" % locals()) params.append("(void *)%(x)s->ga.data" % locals())
params.append("(void *)&%(x)s->ga.offset" % locals()) params.append("(void *)&%(x)s->ga.offset" % locals())
for i in xrange(ndim): for i in xrange(ndim):
...@@ -883,6 +887,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -883,6 +887,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s); ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
""" % locals(), file=sio) """ % locals(), file=sio)
params.append("(void *)&stride_A%(i)d" % locals()) params.append("(void *)&stride_A%(i)d" % locals())
for declaration, value in extra_strides:
print(declaration % locals(), file=sio)
params.append(value)
params.append("(void *)%(z)s->ga.data" % locals()) params.append("(void *)%(z)s->ga.data" % locals())
params.append("(void *)&%(z)s->ga.offset" % locals()) params.append("(void *)&%(z)s->ga.offset" % locals())
...@@ -1779,6 +1786,34 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1779,6 +1786,34 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
} }
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_101(self, sio, node, name, x, z, fail):
# Write to `sio` the C launch code for the "101" reduction pattern: a block
# that picks the thread/block sizes and then runs the kernel call text
# produced by `self._makecall`.
#
# Parameters:
#   sio  -- file-like object that receives the generated C code.
#   node, name, x, z, fail -- forwarded unchanged to `self._makecall`;
#       `x` is also interpolated into the template below as the C variable
#       whose dimensions are read with PyGpuArray_DIMS.
# Returns None; the only effect is the text printed to `sio`.
#
# The call is requested with pattern="1011" plus one extra
# (C declaration, kernel argument) pair for the dims and one for the strides;
# each declares a C variable equal to 1 and passes its address.
# NOTE(review): this looks like the 3d input is handed to the existing 1011
# kernel with an added dimension of size 1 and stride 1 -- confirm against
# `_makecall` and the kernel signature.
makecall = self._makecall(node, name, x, z, fail,
extra_dims=[("size_t one = 1;", "(void *) &one")],
extra_strides=[("ssize_t sone = 1;", "(void *) &sone")],
pattern="1011")
# Launch configuration computed by the C code emitted below:
#   n_threads[0]: stays 1
#   n_threads[1]: grown to 256, then capped at dimension 2 of x
#   n_threads[2]: grown while the thread product stays <= 256, then capped
#                 at 64 and at dimension 0 of x
#   n_blocks[0]:  dimension 1 of x (presumably the axis that is kept, given
#                 the 101 mask -- TODO confirm)
# `verbose` is declared but not used inside this template; presumably the
# text returned by `_makecall` reads it -- TODO confirm before removing.
# The template is filled with `% locals()`, so `%(x)s` and `%(makecall)s`
# refer to the local names above.
print("""
{
int verbose = 0;
// size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
// (size_t) 256), 1, 1};
size_t n_threads[3] = {1, 1, 1};
while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
++n_threads[2];
if (n_threads[2] > 64)
n_threads[2] = 64;
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
def c_code_reduce_111(self, sio, node, name, x, z, fail): def c_code_reduce_111(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
...@@ -2572,7 +2607,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2572,7 +2607,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname, kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var)) params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 1, 1): if self.reduce_mask == (1, 0, 1, 1) or self.reduce_mask == (1, 0, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]', reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub={}) node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
......
...@@ -83,6 +83,7 @@ class test_GpuCAReduceCPY(test_elemwise.test_CAReduce): ...@@ -83,6 +83,7 @@ class test_GpuCAReduceCPY(test_elemwise.test_CAReduce):
op = GpuCAReduceCPY op = GpuCAReduceCPY
reds = [scalar.add, scalar.mul] reds = [scalar.add, scalar.mul]
pre_scalar_op = None pre_scalar_op = None
mode = mode_with_gpu
def test_perform(self): def test_perform(self):
for dtype in self.dtypes + self.bin_dtypes: for dtype in self.dtypes + self.bin_dtypes:
...@@ -172,16 +173,28 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -172,16 +173,28 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001 ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001
((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110 ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110
((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011 ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011
# ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented ((4100, 4, 3), [0, 2]), ((5, 4100, 3), [0, 2]), ((5, 4, 4100), [0, 2]), # 101
((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111 ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111
((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111 ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111
# reduce over 2d
((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011 ((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011
((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101 ((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011 # ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110 by reshape
# ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]), # 1001 by reshape
# ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]), # 1010 not implemented
# ((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]), # 1100 by reshape
# reduce over 3d
# 3d not tested: 1101, 1110, 1111
# ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]), # 1101 by reshape
# ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110 by reshape
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]), # 1011
((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111 ((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111
((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111 ((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 2, 3), [0, 1, 2, 3]), # 1111
# reduce over 4d
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]), # 1111
# test pattern implemented by reshape # test pattern implemented by reshape
# Skip them as this test the op directly, not the optimization with reshape # Skip them as this test the op directly, not the optimization with reshape
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论