Commit 90dd93d0, authored by Frédéric Bastien, committed by GitHub

Merge pull request #5317 from khaotik/cumop

Merge CumsumOp/CumprodOp into CumOp
from __future__ import absolute_import, print_function, division
import os
from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp
from theano.tensor.extra_ops import CumOp
from .basic_ops import infer_context_name
try:
from pygpu import gpuarray
......@@ -12,7 +12,7 @@ from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape)
from .opt import register_opt, op_lifter, register_opt2
class GpuCumsum(GpuKernelBase, Op):
class GpuCumOp(GpuKernelBase, Op):
"""
Parameters
----------
......@@ -20,10 +20,19 @@ class GpuCumsum(GpuKernelBase, Op):
Can not be None. If you want the array flattened, do it before.
"""
SUPPORTED_NDIMS = 3
__props__ = ('axis',)
__props__ = ('axis', 'mode')
def __init__(self, axis):
self.axis = axis
def __init__(self, axis, mode='add'):
    """Store the accumulation axis and the accumulation mode.

    Parameters
    ----------
    axis
        Axis along which to accumulate. Any falsy value (including
        ``None``) is stored as ``0``.
    mode
        ``'add'`` or ``'mul'``; selects the ``+`` or ``*`` operator
        used when the kernels are generated.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'add'`` nor ``'mul'``.
    """
    # Reject unknown modes up front, as CumOp.__init__ does, instead of
    # failing later with a KeyError when the kernel operator is looked up.
    if mode not in ('add', 'mul'):
        raise ValueError('%s: Unknown mode "%s"' % (type(self).__name__, mode))
    self.axis = axis if axis else 0
    self.mode = mode
def __eq__(self, other):
    """Return True when ``other`` has the same exact type, axis and mode."""
    if type(other) == type(self):
        return self.axis == other.axis and self.mode == other.mode
    return False
def __hash__(self):
    """Return the XOR of the hashes of ``axis`` and ``mode``."""
    axis_hash = hash(self.axis)
    mode_hash = hash(self.mode)
    return axis_hash ^ mode_hash
def c_code_cache_version(self):
return (3,)
......@@ -38,14 +47,14 @@ class GpuCumsum(GpuKernelBase, Op):
return node.inputs[0].type.context
def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp"
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\
if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
raise NotImplementedError('Only cum op on 1D, 2D and\
3D arrays are supported right now!')
if self.axis >= x.ndim or self.axis < -x.ndim:
......@@ -56,6 +65,7 @@ class GpuCumsum(GpuKernelBase, Op):
kernels = []
# cumadd
kname = "k_cumadd"
op = {'mul': '*', 'add': '+'}[self.mode]
k_var = "k_cumadd_" + nodename
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
......@@ -75,7 +85,7 @@ class GpuCumsum(GpuKernelBase, Op):
int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
......@@ -86,9 +96,9 @@ class GpuCumsum(GpuKernelBase, Op):
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
# blockCumSum
kname = "k_blockCumSum"
k_var = "k_blockCumSum_" + nodename
# blockCumOp
kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
......@@ -96,109 +106,108 @@ class GpuCumsum(GpuKernelBase, Op):
code = """
// helper functions
WITHIN_KERNEL
void k_reductionPhase(float* partialCumSum) {
void k_reductionPhase(float* partialCumOp) {
// Traverse down from leaves to root building partial sums at internal nodes in the tree.
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) {
partialCumSum[index] += partialCumSum[index - stride];
partialCumOp[index] %(op)s= partialCumOp[index - stride];
}
}
}
WITHIN_KERNEL
void k_fetchData(float* partialCumSum, float* input, int globalThreadID,
void k_fetchData(float* partialCumOp, float* input, int globalThreadID,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
// blockIdx.y and blockIdx.z represents the current independent cumsum
// blockIdx.y and blockIdx.z represents the current independent cum op
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
partialCumSum[threadIdx.x*2] = input[idx_even];
partialCumSum[threadIdx.x*2 + 1] = input[idx_odd];
partialCumOp[threadIdx.x*2] = input[idx_even];
partialCumOp[threadIdx.x*2 + 1] = input[idx_odd];
}
WITHIN_KERNEL
void k_reversePhase(float* partialCumSum) {
void k_reversePhase(float* partialCumOp) {
// Traverse back up the tree building the scan from the partial sums
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) {
partialCumSum[index + stride] += partialCumSum[index];
partialCumOp[index + stride] %(op)s= partialCumOp[index];
}
}
}
WITHIN_KERNEL
void k_pushData(float* partialCumSum, float* output, int globalThreadID,
void k_pushData(float* partialCumOp, float* output, int globalThreadID,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
local_barrier();
// blockIdx.y and blockIdx.z represents the current independent cumsum
// blockIdx.y and blockIdx.z represents the current independent cum op
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] = partialCumSum[threadIdx.x*2];
output[idx_odd] = partialCumSum[threadIdx.x*2 + 1];
output[idx_even] = partialCumOp[threadIdx.x*2];
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumSum(float* input, float* output,
size_t nbElementsPerCumsum, ga_ssize inputStrides_x,
KERNEL void k_blockCumOp(float* input, float* output,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum) {
// Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
// Check if current thread has data to process.
if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
if (globalThreadID >= (nbElementsPerCumOp+1)/2) {
return;
}
extern __shared__ float partialCumSum[];
extern __shared__ float partialCumOp[];
// Load data in shared memory
k_fetchData(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
// Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
// Use a dichotomy approach to compute the cum op (i.e. balanced binary tree).
// The tree is sweeped from the leaves to the root and from the root to the leaves.
// Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
k_reductionPhase(partialCumSum);
k_reversePhase(partialCumSum);
k_reductionPhase(partialCumOp);
k_reversePhase(partialCumOp);
// Write the final output to global memory
k_pushData(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
if (blockSum != NULL){
if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumSum[threadIdx.x*2 + 1];
blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1];
}
}
}
"""
""" % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
# k_finalCumSum
kname = "k_finalCumSum"
k_var = "k_finalCumSum_" + nodename
# k_finalCumOp
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = """
KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process.
if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
if (globalThreadID >= (nbElementsPerCumOp+1)/2)
return;
}
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
......@@ -208,10 +217,10 @@ class GpuCumsum(GpuKernelBase, Op):
int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] += currentBlockSum;
output[idx_odd] += currentBlockSum;
output[idx_even] %(op)s= currentBlockSum;
output[idx_odd] %(op)s= currentBlockSum;
}
"""
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ]
......@@ -263,7 +272,7 @@ class GpuCumsum(GpuKernelBase, Op):
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s;
}
if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
%(fail)s;
}
}
......@@ -274,7 +283,7 @@ class GpuCumsum(GpuKernelBase, Op):
def c_support_code_struct(self, node, nodename):
code = """
int cumSum_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
size_t shape[3] = { 1, 1, 1 };
ssize_t inputStrides_x;
ssize_t inputStrides_y;
......@@ -316,14 +325,14 @@ class GpuCumsum(GpuKernelBase, Op):
int err = pygpu_move(output, input);
return err;
}
// Perform cumsum on array of even size.
size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
// Perform cum op on array of even size.
size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2);
// Determine how many elements can be processed in one block.
size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0);
size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2;
// Determine how many blocks are needed in total.
size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum.
size_t dimGridY; // Nb. of independent cumsums (width).
size_t dimGridZ; // Nb. of independent cumsums (height).
size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX); // Nb. of blocks needed per cum op.
size_t dimGridY; // Nb. of independent cum ops (width).
size_t dimGridZ; // Nb. of independent cum ops (height).
ssize_t tmp;
switch (axis)
{
......@@ -365,18 +374,18 @@ class GpuCumsum(GpuKernelBase, Op):
if (deviceBlockSum == NULL){
return -1;
}
// Perform `maxGridY`*`maxGridZ` cumsums in parallel.
// Perform `maxGridY`*`maxGridZ` cum ops in parallel.
for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);
for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cumsum per block.
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data,
(void*) output->ga.data,
(void*) &nbElementsPerCumsum,
(void*) &nbElementsPerCumOp,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
......@@ -387,39 +396,39 @@ class GpuCumsum(GpuKernelBase, Op):
(void*) &offsetZ,
(void*) deviceBlockSum->ga.data
};
int err = GpuKernel_call(&k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "blockCumSum call failed");
PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
return -1;
}
if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive).
if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
// Do a cum op over the blockSum (recursive).
if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
Py_DECREF(deviceBlockSum);
return -1;
}
// Since there are more than one block (i.e. `dimGridX > 1`)
// report partial cumsums of previous blocks to subsequents ones.
// report partial cum ops of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data,
(void*) deviceBlockSum->ga.data,
(void*) &nbElementsPerCumsum,
(void*) &nbElementsPerCumOp,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ
};
int err = GpuKernel_call(&k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "finalCumSum call failed");
PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
return -1;
}
}
// If shape[axis] is odd, the last element is compute manually
if (shape[axis] != nbElementsPerCumsum){
if (shape[axis] != nbElementsPerCumOp){
size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {1, 1, 1};
size_t tmp0 = shape[axis]-2;
......@@ -450,26 +459,39 @@ class GpuCumsum(GpuKernelBase, Op):
return 0;
}
""" % locals()
return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code
return super(GpuCumOp, self).c_support_code_struct(node, nodename) + code
@register_opt('fast_compile')
@op_lifter([CumsumOp])
@register_opt2([CumsumOp], 'fast_compile')
def local_gpua_cumsumop(op, ctx_name, inputs, outputs):
if inputs[0].dtype == 'float32':
axis = op.axis
x = inputs[0]
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
return None
x = as_gpuarray_variable(x, ctx_name)
# GpuCumsumOp exists only to serve backward compatibility.
# Once an object is created, it will be converted to CumOp object.
class GpuCumsumOp(GpuKernelBase, Op):
SUPPORTED_NDIMS = 3
__props__ = ("axis",)
if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
def __new__(typ, *args, **kwargs):
    """Build a ``GpuCumOp`` in ``'add'`` mode instead of a ``GpuCumsumOp``.

    The returned object is not an instance of ``typ``, so Python does not
    call ``__init__`` on it afterwards. When constructor arguments are
    supplied they are therefore forwarded to ``GpuCumOp.__init__`` here,
    so that ``axis`` gets set. With no arguments the object is returned
    with only ``mode`` set, exactly as before.
    """
    obj = object.__new__(GpuCumOp)
    if args or kwargs:
        # Direct construction, e.g. ``GpuCumsumOp(axis)``: initialise
        # explicitly because the automatic ``__init__`` call is skipped.
        GpuCumOp.__init__(obj, *args, **kwargs)
    # The legacy op is always a cumulative sum.
    obj.mode = 'add'
    return obj
# ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None:
axis = 0
return GpuCumsum(axis)(x)
@register_opt('fast_compile')
@op_lifter([CumOp])
@register_opt2([CumOp], 'fast_compile')
def local_gpua_cumop(op, ctx_name, inputs, outputs):
    """Replace a ``CumOp`` application with ``GpuCumOp`` when supported.

    Returns ``False`` when the input is not float32, or when an explicit
    axis is given and the input has more than
    ``GpuCumOp.SUPPORTED_NDIMS`` dimensions. Otherwise returns the result
    of applying ``GpuCumOp(axis, op.mode)`` to the input converted with
    ``as_gpuarray_variable``.
    """
    x = inputs[0]
    if x.dtype != 'float32':
        return False

    axis = op.axis
    flatten = axis is None
    if not flatten and x.ndim > GpuCumOp.SUPPORTED_NDIMS:
        return False

    x = as_gpuarray_variable(x, ctx_name)

    if flatten:
        # ``GpuCumOp`` assumes the array has been flattened if needed, so
        # reshape to 1D here and accumulate along axis 0.
        if x.ndim > 1:
            x = GpuReshape(1)(x, (-1,))
        axis = 0

    return GpuCumOp(axis, op.mode)(x)
# Skip test if cuda_ndarray is not available.
from __future__ import absolute_import, print_function, division
import itertools
from functools import partial
from itertools import product
import numpy as np
from six.moves import xrange
......@@ -9,54 +9,62 @@ from theano import tensor as T
import theano
import theano.tensor.tests.test_extra_ops
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tensor.extra_ops import CumOp
from theano.tests.unittest_tools import SkipTest
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu, test_ctx_name
from ..extra_ops import GpuCumsum
from ..extra_ops import GpuCumOp
from ..type import get_context
cum_modes = utt.parameterized.expand([('mul',), ('add',)])
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
class TestGpuCumOp(theano.tensor.tests.test_extra_ops.TestCumOp):
mode = mode_with_gpu
def setUp(self):
super(TestGpuCumsum, self).setUp()
super(TestGpuCumOp, self).setUp()
test_ctx = get_context(test_ctx_name)
if test_ctx.kind != b'cuda':
raise SkipTest("Cuda specific tests")
self.max_threads_dim0 = test_ctx.maxlsize0
self.max_grid_size1 = test_ctx.maxgsize2
self.op_class = GpuCumsum
self.op_class = CumOp
def test_infer_shape(self):
# GpuCumSum is only defined for float32 for now, so we skip it
@cum_modes
def test_infer_shape(self, mode):
# GpuCumOp is only defined for float32 for now, so we skip it
# in the unsupported cases
gpucumsum_supported_dtypes = ('float32',)
if theano.config.floatX not in gpucumsum_supported_dtypes:
raise SkipTest('GpuCumSum not implemented for dtype %s'
op_class = partial(self.op_class, mode=mode)
gpucumop_supported_dtypes = ('float32',)
if theano.config.floatX not in gpucumop_supported_dtypes:
raise SkipTest('Gpucumop not implemented for dtype %s'
% theano.config.floatX)
x = T.tensor3('x')
a = np.random.random((3, 5, 2)).astype(theano.config.floatX)
for axis in range(-len(a.shape), len(a.shape)):
self._compile_and_check([x],
[cumsum(x, axis=axis)],
[op_class(axis=axis)(x)],
[a],
self.op_class)
GpuCumOp)
def test_grad(self):
# no grad for GpuCumsum
@cum_modes
def test_grad(self, mode):
# no grad for GpuCumOp
pass
def test_Strides1D(self):
@cum_modes
def test_Strides1D(self, mode):
op_class = partial(self.op_class, mode=mode)
np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
x = T.fvector('x')
for axis in [0, None, -1]:
a = np.random.random((42,)).astype("float32")
cumsum_function = theano.function([x], cumsum(x, axis=axis),
mode=self.mode)
cumop_function = theano.function(
[x], op_class(axis=axis)(x), mode=self.mode)
slicings = [slice(None, None, None), # Normal strides
slice(None, None, 2), # Stepped strides
......@@ -64,22 +72,25 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
]
# Cartesian product of all slicings to test.
for slicing in itertools.product(slicings, repeat=x.ndim):
f = theano.function([x], cumsum(x[slicing], axis=axis),
for slicing in product(slicings, repeat=x.ndim):
f = theano.function([x], op_class(axis=axis)(x[slicing]),
mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
cumsum_function(a[slicing]))
def test_Strides2D(self):
if isinstance(n.op, GpuCumOp)]
utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
utt.assert_allclose(np_func(a[slicing], axis=axis),
cumop_function(a[slicing]))
@cum_modes
def test_Strides2D(self, mode):
np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
op_class = partial(self.op_class, mode=mode)
x = T.fmatrix('x')
for axis in [0, 1, None, -1, -2]:
a = np.random.random((42, 30)).astype("float32")
cumsum_function = theano.function([x], cumsum(x, axis=axis),
mode=self.mode)
cumop_function = theano.function(
[x], op_class(axis=axis)(x), mode=self.mode)
slicings = [slice(None, None, None), # Normal strides
slice(None, None, 2), # Stepped strides
......@@ -87,22 +98,25 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
]
# Cartesian product of all slicings to test.
for slicing in itertools.product(slicings, repeat=x.ndim):
f = theano.function([x], cumsum(x[slicing], axis=axis),
for slicing in product(slicings, repeat=x.ndim):
f = theano.function([x], op_class(axis=axis)(x[slicing]),
mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
cumsum_function(a[slicing]))
def test_Strides3D(self):
if isinstance(n.op, GpuCumOp)]
utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
utt.assert_allclose(np_func(a[slicing], axis=axis),
cumop_function(a[slicing]))
@cum_modes
def test_Strides3D(self, mode):
np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
op_class = partial(self.op_class, mode=mode)
x = T.ftensor3('x')
for axis in [0, 1, 2, None, -1, -2, -3]:
a = np.random.random((42, 30, 25)).astype("float32")
cumsum_function = theano.function([x], cumsum(x, axis=axis),
mode=self.mode)
cumop_function = theano.function(
[x], op_class(axis=axis)(x), mode=self.mode)
slicings = [slice(None, None, None), # Normal strides
slice(None, None, 2), # Stepped strides
......@@ -110,45 +124,51 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
]
# Cartesian product of all slicings to test.
for slicing in itertools.product(slicings, repeat=x.ndim):
f = theano.function([x], cumsum(x[slicing], axis=axis),
mode=self.mode)
for slicing in product(slicings, repeat=x.ndim):
f = theano.function(
[x], op_class(axis=axis)(x[slicing]), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
cumsum_function(a[slicing]))
def test_GpuCumsum1D(self):
if isinstance(n.op, GpuCumOp)]
utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
utt.assert_allclose(np_func(a[slicing], axis=axis),
cumop_function(a[slicing]))
@cum_modes
def test_GpuCumOp1D(self, mode):
np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
op_class = partial(self.op_class, mode=mode)
block_max_size = self.max_threads_dim0 * 2
x = T.fvector('x')
f = theano.function([x], cumsum(x), mode=self.mode)
f = theano.function([x], op_class(axis=0)(x), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
if isinstance(n.op, GpuCumOp)]
# Extensive testing for the first 1025 sizes
a = np.random.random(1025).astype("float32")
for i in xrange(a.shape[0]):
utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
utt.assert_allclose(np_func(a[:i]), f(a[:i]))
# Use multiple GPU threadblocks
a = np.random.random((block_max_size + 2, )).astype("float32")
utt.assert_allclose(np.cumsum(a), f(a))
utt.assert_allclose(np_func(a), f(a))
# Use recursive cumsum
# Use recursive cumop
a = np.ones((block_max_size * (block_max_size + 1) + 2,),
dtype="float32")
utt.assert_allclose(np.cumsum(a), f(a))
utt.assert_allclose(np_func(a), f(a))
def test_GpuCumsum2D(self):
@cum_modes
def test_GpuCumOp2D(self, mode):
np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
op_class = partial(self.op_class, mode=mode)
block_max_size = self.max_threads_dim0 * 2
x = T.fmatrix('x')
for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
f = theano.function([x], op_class(axis=axis)(x), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
if isinstance(n.op, GpuCumOp)]
# Extensive testing for the first 1025 sizes
a_shape = [5, 5]
......@@ -158,36 +178,39 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
for i in xrange(a.shape[shape_axis]):
slices[shape_axis] = slice(i)
fa = f(a[slices])
npa = np.cumsum(a[slices], axis=axis)
npa = np_func(a[slices], axis=axis)
utt.assert_allclose(npa, fa)
# Use multiple GPU threadblocks
a_shape = [5, 5]
a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
utt.assert_allclose(np_func(a, axis=axis), f(a))
# Use multiple GPU gridblocks
a_shape = [4, 4]
a_shape[1 - shape_axis] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
utt.assert_allclose(np_func(a, axis=axis), f(a), rtol=5e-5)
# Use recursive cumsum
# Use recursive cumop
a_shape = [3, 3]
a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32")
a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
utt.assert_allclose(np_func(a, axis=axis), f(a))
def test_GpuCumsum3D(self):
@cum_modes
def test_GpuCumOp3D(self, mode):
np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
op_class = partial(self.op_class, mode=mode)
block_max_size = self.max_threads_dim0 * 2
x = T.ftensor3('x')
for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0], [0, 1, 2, None, -1, -2, -3]):
f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
f = theano.function([x], op_class(axis=axis)(x), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
if isinstance(n.op, GpuCumOp)]
# Extensive testing for the first 1025 sizes
a_shape = [5, 5, 5]
......@@ -197,14 +220,14 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
for i in xrange(a.shape[shape_axis]):
slices[shape_axis] = slice(i)
fa = f(a[slices])
npa = np.cumsum(a[slices], axis=axis)
npa = np_func(a[slices], axis=axis)
utt.assert_allclose(npa, fa)
# Use multiple GPU threadblocks (along accumulation axis)
a_shape = [2, 2, 2]
a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
utt.assert_allclose(np_func(a, axis=axis), f(a))
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape = [5, 5, 5]
......@@ -213,7 +236,7 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
if axis is None:
# Avoid floating point error
a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
utt.assert_allclose(np_func(a, axis=axis), f(a))
a_shape = [5, 5, 5]
a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
......@@ -221,18 +244,20 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
if axis is None:
# Avoid floating point error
a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
utt.assert_allclose(np_func(a, axis=axis), f(a))
# Use recursive cumsum (along accumulation axis)
# Use recursive cumop (along accumulation axis)
a_shape = [3, 3, 3]
a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32")
a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
utt.assert_allclose(np_func(a, axis=axis), f(a))
def test_GpuCumsum4D(self):
@cum_modes
def test_GpuCumOp4D(self, mode):
op_class = partial(self.op_class, mode=mode)
# Should not use the GPU version.
x = T.ftensor4('x')
f = theano.function([x], cumsum(x, axis=1), mode=self.mode)
f = theano.function([x], op_class(axis=1)(x), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, CumsumOp)]
if isinstance(n.op, GpuCumOp)]
......@@ -5,7 +5,7 @@ from theano import Op
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.sandbox.cuda.basic_ops import gpu_flatten
from theano.tensor.extra_ops import CumsumOp
from theano.tensor.extra_ops import CumOp
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
......@@ -13,7 +13,7 @@ if cuda_available:
from theano.sandbox.cuda import register_opt as register_gpu_opt
class GpuCumsum(CumsumOp, GpuOp):
class GpuCumsum(CumOp, GpuOp):
"""
Parameters
......@@ -438,13 +438,16 @@ def values_eq_approx_high_tol(a, b):
@register_gpu_opt()
@local_optimizer([CumsumOp])
@local_optimizer([CumOp])
def use_gpu_cumsum(node):
if type(node.op) is CumsumOp \
if type(node.op) is CumOp \
and node.inputs[0].dtype == 'float32' \
and node.inputs[0].owner \
and isinstance(node.inputs[0].owner.op, HostFromGpu):
if node.op.mode != 'add':
return None
axis = node.op.axis
x = node.inputs[0]
......
......@@ -7,7 +7,7 @@ import numpy as np
from six.moves import xrange
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tensor.extra_ops import cumsum, CumOp
from theano.tests import unittest_tools as utt
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available:
......@@ -22,7 +22,7 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumOp):
mode = mode_with_gpu
def setUp(self):
......@@ -232,4 +232,4 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
x = T.ftensor4('x')
f = theano.function([x], cumsum(x, axis=1), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, CumsumOp)]
if isinstance(n.op, CumOp)]
......@@ -242,13 +242,16 @@ def searchsorted(x, v, side='left', sorter=None):
return SearchsortedOp(side=side)(x, v, sorter)
class CumsumOp(theano.Op):
# See function cumsum for docstring
class CumOp(theano.Op):
# See function cumsum/cumprod for docstring
__props__ = ("axis",)
__props__ = ("axis", "mode")
def __init__(self, axis=None):
def __init__(self, axis=None, mode='add'):
    """Record the axis and the accumulation mode.

    ``mode`` must be ``'add'`` or ``'mul'``; any other value raises
    ``ValueError``. ``axis`` is stored as given.
    """
    mode_is_known = mode in ('add', 'mul')
    if not mode_is_known:
        raise ValueError('%s: Unknown mode "%s"' % (type(self).__name__, mode))
    self.axis, self.mode = axis, mode
def make_node(self, x):
x = basic.as_tensor_variable(x)
......@@ -264,20 +267,39 @@ class CumsumOp(theano.Op):
def perform(self, node, inputs, output_storage):
x = inputs[0]
z = output_storage[0]
z[0] = np.cumsum(x, axis=self.axis)
z[0] = {'add': np.cumsum, 'mul': np.cumprod}[self.mode](x, axis=self.axis)
def grad(self, inputs, output_gradients):
[gi] = output_gradients
x, = inputs
gi, = output_gradients
if self.axis is None:
return [cumsum(gi[::-1])[::-1].reshape(inputs[0].shape)]
if self.mode == 'add':
return [cumsum(gi[::-1])[::-1].reshape(x.shape)]
elif self.mode == 'mul':
fx = cumprod(x, axis=self.axis)
return [cumsum(
(fx * gi)[::-1])[::-1].reshape(x.shape) / x]
else:
raise NotImplementedError(
'%s: unknown gradient for mode "%s"' %
(type(self).__name__, self.mode))
# We need to reverse the gradients along ``self.axis``,
# compute cumsum, then reverse again
reverse_slicing = [slice(None, None, None)] * gi.ndim
reverse_slicing[self.axis] = slice(None, None, -1)
reverse_slicing = tuple(reverse_slicing)
return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]]
# We need to reverse the gradients along ``self.axis``,
# compute cumsum, then reverse again
if self.mode == 'add':
return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]]
elif self.mode == 'mul':
fx = cumprod(x, axis=self.axis)
return [cumsum(
(fx * gi)[reverse_slicing], self.axis)[reverse_slicing] / x]
else:
raise NotImplementedError(
'%s: unknown gradient for mode "%s"' %
(type(self).__name__, self.mode))
def infer_shape(self, node, shapes):
if self.axis is None:
......@@ -290,6 +312,7 @@ class CumsumOp(theano.Op):
z, = onames
axis = self.axis
fail = sub['fail']
func = dict(mul='CumProd', add='CumSum')[self.mode]
if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1):
code = """
......@@ -303,13 +326,13 @@ class CumsumOp(theano.Op):
if (!%(z)s)
%(fail)s;
{
PyObject * t = PyArray_CumSum(
PyObject * t = PyArray_%(func)s(
%(x)s, NPY_MAXDIMS,
PyArray_TYPE((PyArrayObject*) py_%(x)s), %(z)s);
if (!t){
%(fail)s;
}
// Because PyArray_CumSum returns a newly created reference on t.
// Because PyArray_%(func)s returns a newly created reference on t.
Py_XDECREF(t);
}
""" % locals()
......@@ -325,13 +348,13 @@ class CumsumOp(theano.Op):
%(fail)s;
{
PyObject * t = PyArray_CumSum(
PyObject * t = PyArray_%(func)s(
%(x)s, %(axis)s,
PyArray_TYPE((PyArrayObject*) py_%(x)s), %(z)s);
if (!t){
%(fail)s;
}
// Because PyArray_CumSum returns a newly created reference on t.
// Because PyArray_%(func)s returns a newly created reference on t.
Py_XDECREF(t);
}
""" % locals()
......@@ -339,10 +362,10 @@ class CumsumOp(theano.Op):
return code
def c_code_cache_version(self):
return (6,)
return (7,)
def __str__(self):
return "%s{%s}" % (self.__class__.__name__, self.axis)
return "%s{%s, %s}" % (self.__class__.__name__, self.axis, self.mode)
def cumsum(x, axis=None):
......@@ -362,112 +385,7 @@ def cumsum(x, axis=None):
.. versionadded:: 0.7
"""
return CumsumOp(axis=axis)(x)
class CumprodOp(theano.Op):
    """Op computing the cumulative product of a tensor along ``axis``.

    With ``axis=None`` the output is a vector (the input is treated as
    flattened); otherwise the output has the same type as the input.
    See the function ``cumprod`` for the user-facing documentation.
    """

    __props__ = ("axis",)

    def __init__(self, axis=None):
        self.axis = axis

    def make_node(self, x):
        """Build the Apply node, rejecting an ``axis`` outside ``x``'s rank."""
        x = basic.as_tensor_variable(x)
        out_type = x.type()

        if self.axis is None:
            # Flatten
            out_type = theano.tensor.vector(dtype=x.dtype)
        elif self.axis < -x.ndim or self.axis >= x.ndim:
            raise ValueError('axis(={0}) out of bounds'.format(self.axis))

        return theano.Apply(self, [x], [out_type])

    def perform(self, node, inputs, output_storage):
        """Python implementation: delegate to ``np.cumprod``."""
        output_storage[0][0] = np.cumprod(inputs[0], axis=self.axis)

    def grad(self, inputs, output_gradients):
        """Return the gradient of the cumulative product w.r.t. its input.

        The result is a reversed cumulative sum of ``cumprod(x) * gi``
        divided by ``x``.
        """
        [x] = inputs
        [gi] = output_gradients
        weighted = cumprod(x, axis=self.axis) * gi

        if self.axis is None:
            return [cumsum(weighted[::-1])[::-1].reshape(x.shape) / x]

        # Flip along ``self.axis`` only, take the cumsum, then flip back.
        flip = [slice(None, None, None)] * gi.ndim
        flip[self.axis] = slice(None, None, -1)
        flip = tuple(flip)
        return [cumsum(weighted[flip], self.axis)[flip] / x]

    def infer_shape(self, node, shapes):
        """Output shape: unchanged, or a single flattened length if no axis."""
        if self.axis is not None:
            return shapes
        # Flatten
        return [(tensor.prod(shapes[0]),)]

    def c_code(self, node, name, inames, onames, sub):
        """Return C code calling ``PyArray_CumProd`` into the output buffer."""
        [x] = inames
        [z] = onames
        # Only these four keys are referenced by the templates below.
        subs = dict(x=x, z=z, axis=self.axis, fail=sub['fail'])

        flattened = self.axis is None or (
            self.axis == 0 and node.inputs[0].ndim == 1)

        if flattened:
            # 1-d output sized to the total number of input elements.
            return """
npy_intp shape[1] = { PyArray_SIZE(%(x)s) };
if(!(%(z)s && PyArray_DIMS(%(z)s)[0] == shape[0]))
{
Py_XDECREF(%(z)s);
%(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, PyArray_TYPE((PyArrayObject*) py_%(x)s));
}
if (!%(z)s)
%(fail)s;
{
PyObject * t = PyArray_CumProd(
%(x)s, NPY_MAXDIMS,
PyArray_TYPE((PyArrayObject*) py_%(x)s), %(z)s);
if (!t){
%(fail)s;
}
// Because PyArray_CumSum returns a newly created reference on t.
Py_XDECREF(t);
}
""" % subs

        # Output keeps the input's dimensions; accumulate along ``axis``.
        return """
if(!(%(z)s && PyArray_CompareLists(PyArray_DIMS(%(z)s), PyArray_DIMS(%(x)s), PyArray_NDIM(%(x)s)) ))
{
Py_XDECREF(%(z)s);
%(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), PyArray_TYPE((PyArrayObject*) py_%(x)s));
}
if (!%(z)s)
%(fail)s;
{
PyObject * t = PyArray_CumProd(
%(x)s, %(axis)s,
PyArray_TYPE((PyArrayObject*) py_%(x)s), %(z)s);
if (!t){
%(fail)s;
}
// Because PyArray_CumSum returns a newly created reference on t.
Py_XDECREF(t);
}
""" % subs

    def c_code_cache_version(self):
        return (4,)

    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.axis)
return CumOp(axis=axis, mode='add')(x)
def cumprod(x, axis=None):
......@@ -488,7 +406,27 @@ def cumprod(x, axis=None):
.. versionadded:: 0.7
"""
return CumprodOp(axis=axis)(x)
return CumOp(axis=axis, mode='mul')(x)
# CumsumOp and CumprodOp are kept only for backward compatibility, so that
# Theano functions pickled with the old Ops can still be unpickled.
class CumsumOp(theano.Op):
    """Backward-compatibility stand-in for the former cumulative-sum Op.

    Creating or unpickling a ``CumsumOp`` never produces an instance of this
    class: ``__new__`` returns a ``CumOp`` whose ``mode`` is ``'add'``.
    """

    __props__ = ("axis",)

    def __new__(typ, *args, **kwargs):
        # ``object.__new__`` has no use for the constructor arguments, so
        # allocate the replacement Op without forwarding them.
        obj = object.__new__(CumOp)
        # Python only calls ``__init__`` automatically when ``__new__``
        # returns an instance of ``typ``; a ``CumOp`` is not one, so run it
        # here. Otherwise ``CumsumOp(axis=1)`` would come back without an
        # ``axis`` attribute. When unpickling, the restored state is applied
        # after this call and overrides these defaults.
        obj.__init__(*args, **kwargs)
        # This shim always means cumulative sum.
        obj.mode = 'add'
        return obj
class CumprodOp(theano.Op):
    """Backward-compatibility stand-in for the former cumulative-product Op.

    Creating or unpickling a ``CumprodOp`` never produces an instance of this
    class: ``__new__`` returns a ``CumOp`` whose ``mode`` is ``'mul'``.
    """

    __props__ = ("axis",)

    def __new__(typ, *args, **kwargs):
        # ``object.__new__`` has no use for the constructor arguments, so
        # allocate the replacement Op without forwarding them.
        obj = object.__new__(CumOp)
        # Python only calls ``__init__`` automatically when ``__new__``
        # returns an instance of ``typ``; a ``CumOp`` is not one, so run it
        # here. Otherwise ``CumprodOp(axis=1)`` would come back without an
        # ``axis`` attribute. When unpickling, the restored state is applied
        # after this call and overrides these defaults.
        obj.__init__(*args, **kwargs)
        # This shim always means cumulative product.
        obj.mode = 'mul'
        return obj
class DiffOp(theano.Op):
......
from __future__ import absolute_import, print_function, division
from functools import partial
import numpy as np
import numpy
......@@ -7,7 +8,7 @@ import theano
from theano.tests import unittest_tools as utt
from theano.tensor.extra_ops import (SearchsortedOp, searchsorted,
CumsumOp, cumsum, CumprodOp, cumprod,
CumOp, cumsum, cumprod,
CpuContiguous, cpu_contiguous,
bincount, DiffOp, diff, squeeze, compress,
RepeatOp, repeat, Bartlett, bartlett,
......@@ -121,74 +122,33 @@ class TestSearchsortedOp(utt.InferShapeTester):
utt.verify_grad(self.op, [self.a[self.idx_sorted], self.b])
class TestCumsumOp(utt.InferShapeTester):
class TestCumOp(utt.InferShapeTester):
def setUp(self):
super(TestCumsumOp, self).setUp()
self.op_class = CumsumOp
self.op = CumsumOp()
super(TestCumOp, self).setUp()
self.op_class = CumOp
self.op = CumOp()
def test_cumsumOp(self):
def test_cum_op(self):
x = T.tensor3('x')
a = np.random.random((3, 5, 2)).astype(config.floatX)
# Test axis out of bounds
self.assertRaises(ValueError, cumsum, x, axis=3)
self.assertRaises(ValueError, cumsum, x, axis=-4)
f = theano.function([x], cumsum(x))
assert np.allclose(np.cumsum(a), f(a)) # Test axis=None
for axis in range(-len(a.shape), len(a.shape)):
f = theano.function([x], cumsum(x, axis=axis))
assert np.allclose(np.cumsum(a, axis=axis), f(a))
def test_infer_shape(self):
x = T.tensor3('x')
a = np.random.random((3, 5, 2)).astype(config.floatX)
# Test axis=None
self._compile_and_check([x],
[self.op(x)],
[a],
self.op_class)
for axis in range(-len(a.shape), len(a.shape)):
self._compile_and_check([x],
[cumsum(x, axis=axis)],
[a],
self.op_class)
def test_grad(self):
a = np.random.random((3, 5, 2)).astype(config.floatX)
utt.verify_grad(self.op, [a]) # Test axis=None
for axis in range(-len(a.shape), len(a.shape)):
utt.verify_grad(self.op_class(axis=axis), [a], eps=4e-4)
class TestCumprodOp(utt.InferShapeTester):
def setUp(self):
super(TestCumprodOp, self).setUp()
self.op_class = CumprodOp
self.op = CumprodOp()
def test_CumprodOp(self):
x = T.tensor3('x')
a = np.random.random((3, 5, 2)).astype(config.floatX)
# Test axis out of bounds
self.assertRaises(ValueError, cumprod, x, axis=3)
self.assertRaises(ValueError, cumprod, x, axis=-4)
f = theano.function([x], cumprod(x))
assert np.allclose(np.cumprod(a), f(a)) # Test axis=None
f = theano.function([x], [cumsum(x), cumprod(x)])
s, p = f(a)
assert np.allclose(np.cumsum(a), s) # Test axis=None
assert np.allclose(np.cumprod(a), p) # Test axis=None
for axis in range(-len(a.shape), len(a.shape)):
f = theano.function([x], cumprod(x, axis=axis))
assert np.allclose(np.cumprod(a, axis=axis), f(a))
f = theano.function([x], [cumsum(x, axis=axis), cumprod(x, axis=axis)])
s, p = f(a)
assert np.allclose(np.cumsum(a, axis=axis), s)
assert np.allclose(np.cumprod(a, axis=axis), p)
def test_infer_shape(self):
x = T.tensor3('x')
......@@ -202,17 +162,19 @@ class TestCumprodOp(utt.InferShapeTester):
for axis in range(-len(a.shape), len(a.shape)):
self._compile_and_check([x],
[cumprod(x, axis=axis)],
[cumsum(x, axis=axis)],
[a],
self.op_class)
def test_grad(self):
a = np.random.random((3, 5, 2)).astype(config.floatX)
utt.verify_grad(self.op, [a]) # Test axis=None
utt.verify_grad(self.op_class(mode='add'), [a]) # Test axis=None
utt.verify_grad(self.op_class(mode='mul'), [a]) # Test axis=None
for axis in range(-len(a.shape), len(a.shape)):
utt.verify_grad(self.op_class(axis=axis), [a])
utt.verify_grad(self.op_class(axis=axis, mode='add'), [a], eps=4e-4)
utt.verify_grad(self.op_class(axis=axis, mode='mul'), [a], eps=4e-4)
class TestBinCount(utt.InferShapeTester):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论