提交 e4ae792a 作者: Arnaud Bergeron

Flake8 for elemwise.py

上级 2e3d841e
...@@ -4,7 +4,6 @@ import os ...@@ -4,7 +4,6 @@ import os
from theano.compat import izip from theano.compat import izip
import numpy import numpy
import theano
from theano import Apply, scalar, config from theano import Apply, scalar, config
from theano import scalar as scal from theano import scalar as scal
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
...@@ -94,7 +93,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -94,7 +93,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
try: try:
support_code = self.scalar_op.c_support_code() support_code = self.scalar_op.c_support_code()
if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
support_code.strip() != ""): support_code.strip() != ""):
# The macro is fine, the C++ struct is not. # The macro is fine, the C++ struct is not.
raise SupportCodeError(support_code) raise SupportCodeError(support_code)
except MethodNotDefined: except MethodNotDefined:
...@@ -108,7 +107,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -108,7 +107,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs] scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
outs = [make_argument(o, 'o%d' % (n,)) for n, o in outs = [make_argument(o, 'o%d' % (n,)) for n, o in
enumerate(node.outputs) if not n in self.inplace_pattern] enumerate(node.outputs) if n not in self.inplace_pattern]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs] scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins], fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
...@@ -132,7 +131,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -132,7 +131,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else: else:
scal_out.append(arg.name + '[i]') scal_out.append(arg.name + '[i]')
kop = self.scalar_op.c_code(fake_node, nodename+'_scalar', kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
scal_in, scal_out, scal_in, scal_out,
dict(fail='return;')) dict(fail='return;'))
...@@ -169,9 +168,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -169,9 +168,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
("npy_float16", "ga_half"), ("npy_float16", "ga_half"),
("npy_float32", "ga_float"), ("npy_float32", "ga_float"),
("npy_float64", "ga_double"), ("npy_float64", "ga_double"),
]: ]:
kop = kop.replace(npy, ga) kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
def c_header_dirs(self): def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
...@@ -399,7 +398,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -399,7 +398,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0], param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i)) i=i))
for n, (name, var) in enumerate(zip(inputs + outputs, for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)): node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern: if (n - len(inputs)) in self.inplace_pattern:
continue continue
dtype = dtype_to_ctype(var.dtype) dtype = dtype_to_ctype(var.dtype)
...@@ -417,7 +416,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -417,7 +416,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
GpuKernel_error(&%(kname)s, err)); GpuKernel_error(&%(kname)s, err));
%(fail)s; %(fail)s;
} }
""" % dict(kname=kname,fail=fail) """ % dict(kname=kname, fail=fail)
if config.gpuarray.sync: if config.gpuarray.sync:
code += """ code += """
err = GpuArray_sync(&%(z)s->ga); err = GpuArray_sync(&%(z)s->ga);
...@@ -495,7 +494,7 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -495,7 +494,7 @@ class GpuDimShuffle(HideC, DimShuffle):
res = input res = input
res = res.transpose(self.shuffle+self.drop) res = res.transpose(self.shuffle + self.drop)
shape = list(res.shape[:len(self.shuffle)]) shape = list(res.shape[:len(self.shuffle)])
for augm in self.augment: for augm in self.augment:
...@@ -533,7 +532,7 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -533,7 +532,7 @@ class GpuDimShuffle(HideC, DimShuffle):
Py_DECREF(tmp); Py_DECREF(tmp);
return res; return res;
} }
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle+self.drop)), """ % dict(shuffle=', '.join(str(a) for a in (self.shuffle + self.drop)),
name=name, nd_out=len(self.new_order), name=name, nd_out=len(self.new_order),
copy_shape=copy_shape(len(self.new_order))) copy_shape=copy_shape(len(self.new_order)))
...@@ -581,7 +580,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -581,7 +580,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
pre_scalar_op pre_scalar_op
If present, must be a scalar op with only 1 input. We will execute it If present, must be a scalar op with only 1 input. We will execute it
on the input value before reduction. on the input value before reduction.
Examples Examples
-------- --------
When scalar_op is a theano.scalar.basic.Add instance: When scalar_op is a theano.scalar.basic.Add instance:
...@@ -671,8 +670,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -671,8 +670,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if self.pre_scalar_op: if self.pre_scalar_op:
# Currently we only tested pre_scalar_op that don't cause # Currently we only tested pre_scalar_op that don't cause
# upcast. # upcast.
d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
assert d1.dtype == ret.outputs[0].dtype
assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
if self.reduce_mask is None: if self.reduce_mask is None:
if self.axis is None: if self.axis is None:
...@@ -687,8 +684,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -687,8 +684,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if (x.type.ndim != len(self.reduce_mask)): if (x.type.ndim != len(self.reduce_mask)):
raise TypeError("x must have rank %i" % len(self.reduce_mask)) raise TypeError("x must have rank %i" % len(self.reduce_mask))
if ("complex" in x.dtype or if ("complex" in x.dtype or
"complex" in ret.outputs[0].dtype or "complex" in ret.outputs[0].dtype or
"complex" in self._acc_dtype(x.dtype)): "complex" in self._acc_dtype(x.dtype)):
raise NotImplementedError("We don't support complex in gpu reduction") raise NotImplementedError("We don't support complex in gpu reduction")
return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype, return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
ret.outputs[0].type.broadcastable)()]) ret.outputs[0].type.broadcastable)()])
...@@ -863,14 +860,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -863,14 +860,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code. # check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle # TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims. # TODO: if only some dims are ccontiguous, call version with less dims.
print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){'%locals(), file=sio) print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){' % locals(),
file=sio)
self.c_code_reduce_ccontig(sio, node, name, x, z, fail) self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print("}else{", file=sio) print("}else{", file=sio)
getattr(self, 'c_code_reduce_%s'%(''.join( getattr(self, 'c_code_reduce_%s' %
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) (''.join(str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
print("}", file=sio) print("}", file=sio)
else: else:
getattr(self, 'c_code_reduce_%s'%(''.join( getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
# \end bracket the reduction ... # \end bracket the reduction ...
...@@ -1094,8 +1093,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1094,8 +1093,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
else: else:
assert isinstance(self.scalar_op, (scal.Maximum, assert isinstance(self.scalar_op, (scal.Maximum,
scal.Minimum)) scal.Minimum))
if self.pre_scalar_op: # TODO, multi_dtype! if self.pre_scalar_op: # TODO: multiple dtypes
#dtype = node.inputs[0].dtype # dtype = node.inputs[0].dtype
dtype = 'float32' dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)() dummy_var = scal.Scalar(dtype=dtype)()
...@@ -1171,7 +1170,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1171,7 +1170,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Parameters Parameters
---------- ----------
node, name, sub node, name, sub
These should be passed through from the original call to c_code. These should be passed through from the original call to c_code.
""" """
...@@ -1411,7 +1410,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1411,7 +1410,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N): def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
""" """
Parameters Parameters
---------- ----------
N N
...@@ -1946,9 +1945,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1946,9 +1945,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
version = [16] # the version corresponding to the c code in this Op version = [16] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version()) version.extend(self.scalar_op.c_code_cache_version())
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version()) version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
...@@ -1962,7 +1958,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1962,7 +1958,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype = node.inputs[0].dtype in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype) acc_dtype = self._acc_dtype(node.inputs[0].dtype)
flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype) flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype) in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype) out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype) acc_type = gpuarray.dtype_to_ctype(acc_dtype)
...@@ -2106,10 +2102,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2106,10 +2102,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
] ]
kernels.append(Kernel(code=sio.getvalue(), name=kname, kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var)) params=params, flags=flags, objvar=k_var))
#01, 011, 0111 # 01, 011, 0111
if (0 == self.reduce_mask[0] and if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]): nd_in in[2, 3, 4]):
# this kernel uses one block for each row. # this kernel uses one block for each row.
# threads per block for each element per row. # threads per block for each element per row.
...@@ -2303,10 +2299,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2303,10 +2299,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# this kernel uses one block for multiple column(up to 32TODO), # this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column. # threads per block for each element per column.
# thread.x = dim 2 contiguous # thread.x = dim 2 contiguous
# thread.y = dim 1 # thread.y = dim 1
# block.x = dim 0 # block.x = dim 0
# block.y = dim 1 rest # block.y = dim 1 rest
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner") decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
...@@ -2515,7 +2511,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2515,7 +2511,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kernels.append(Kernel(code=sio.getvalue(), name=kname, kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var)) params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1): if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]', reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={}) node, nodename, sub={})
...@@ -2625,7 +2621,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2625,7 +2621,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])") reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011" kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename k_var = "kernel_reduce_1011_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""
KERNEL void %(kname)s( KERNEL void %(kname)s(
...@@ -2712,7 +2708,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2712,7 +2708,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
# cache the kernel object # cache the kernel object
self.get_kernel_cache(node) self.get_kernel_cache(node)
return super(GpuCAReduceCPY, self).make_thunk(node, storage_map, return super(GpuCAReduceCPY, self).make_thunk(node, storage_map,
compute_map, no_recycling) compute_map, no_recycling)
def get_kernel_cache(self, node): def get_kernel_cache(self, node):
attr = '@cache_reduction_k' attr = '@cache_reduction_k'
...@@ -2753,7 +2749,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2753,7 +2749,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
flags=Kernel.get_flags(node.inputs[0].type.dtype, flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype, acc_dtype,
node.outputs[0].type.dtype), node.outputs[0].type.dtype),
objvar='k_reduk_'+name)] objvar='k_reduk_' + name)]
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])): if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
...@@ -2768,8 +2764,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2768,8 +2764,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d) if (%(sync)d)
GpuArray_sync(&%(out)s->ga); GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'], """ % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync)) sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node) k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size, _, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim) node.inputs[0].ndim)
...@@ -2816,8 +2812,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2816,8 +2812,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s %(fail)s
} }
} }
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'], """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype)) out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else: else:
code += """ code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) { if (%(output)s == NULL || %(output)s->ga.nd != 0) {
...@@ -2828,8 +2824,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2828,8 +2824,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s %(fail)s
} }
} }
""" % dict(output=output, fail=sub['fail'], """ % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype)) out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype: if acc_dtype != node.outputs[0].type.dtype:
code += """ code += """
...@@ -2837,12 +2833,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2837,12 +2833,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)s, GA_C_ORDER, pygpu_default_context(), %(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None); Py_None);
if (!tmp) %(fail)s if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype)) """ % dict(output=output, fail=sub['fail'],
acc_type=dtype_to_typecode(acc_dtype))
else: else:
code += """ code += """
tmp = %(output)s; tmp = %(output)s;
Py_INCREF(tmp); Py_INCREF(tmp);
""" % dict(output=output) """ % dict(output=output)
# We need the proxies since we are passing a pointer to the # We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the # data into the call and therefore we need a real copy of the
...@@ -2850,7 +2847,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2850,7 +2847,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
code += """ code += """
args[0] = &n; args[0] = &n;
args[1] = tmp->ga.data; args[1] = tmp->ga.data;
""" % dict(output=output) """ % dict(output=output)
p = 2 p = 2
for i in range(node.inputs[0].ndim): for i in range(node.inputs[0].ndim):
...@@ -2858,7 +2855,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2858,7 +2855,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s]; proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
args[%(p)s] = &proxy_dim[%(i)s]; args[%(p)s] = &proxy_dim[%(i)s];
n *= %(input)s->ga.dimensions[%(i)s]; n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input) """ % dict(i=i, p=p, input=input)
p += 1 p += 1
if not redux[i]: if not redux[i]:
code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i) code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
...@@ -2867,14 +2864,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2867,14 +2864,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
args[%(p)s] = %(input)s->ga.data; args[%(p)s] = %(input)s->ga.data;
proxy_off = %(input)s->ga.offset; proxy_off = %(input)s->ga.offset;
args[%(p)s+1] = &proxy_off; args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input) """ % dict(p=p, input=input)
p += 2 p += 2
for i in range(node.inputs[0].ndim): for i in range(node.inputs[0].ndim):
code += """ code += """
proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s]; proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
args[%(p)s] = &proxy_str[%(i)s]; args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input) """ % dict(p=p, i=i, input=input)
p += 1 p += 1
code += """ code += """
...@@ -2911,9 +2908,9 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2911,9 +2908,9 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s %(fail)s
} }
} }
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync), """ % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input, ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype)) cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code return code
...@@ -2942,8 +2939,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2942,8 +2939,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
redux = self.redux redux = self.redux
if any(redux): if any(redux):
output[0] = self.get_kernel_cache(node)(input).astype(copy=False, output[0] = self.get_kernel_cache(node)(input).astype(
dtype=node.outputs[0].type.dtype) copy=False, dtype=node.outputs[0].type.dtype)
else: else:
output[0] = pygpu.gpuarray.array(input, copy=True, output[0] = pygpu.gpuarray.array(input, copy=True,
dtype=node.outputs[0].type.dtype) dtype=node.outputs[0].type.dtype)
......
...@@ -157,7 +157,6 @@ whitelist_flake8 = [ ...@@ -157,7 +157,6 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py", "sandbox/linalg/ops.py",
"sandbox/linalg/__init__.py", "sandbox/linalg/__init__.py",
"sandbox/linalg/tests/test_linalg.py", "sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py", "sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/kernel_codegen.py", "sandbox/gpuarray/kernel_codegen.py",
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论