提交 e4ae792a authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Flake8 for elemwise.py

上级 2e3d841e
......@@ -4,7 +4,6 @@ import os
from theano.compat import izip
import numpy
import theano
from theano import Apply, scalar, config
from theano import scalar as scal
from six.moves import StringIO, xrange
......@@ -94,7 +93,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
try:
support_code = self.scalar_op.c_support_code()
if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
support_code.strip() != ""):
support_code.strip() != ""):
# The macro is fine, the C++ struct is not.
raise SupportCodeError(support_code)
except MethodNotDefined:
......@@ -108,7 +107,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
outs = [make_argument(o, 'o%d' % (n,)) for n, o in
enumerate(node.outputs) if not n in self.inplace_pattern]
enumerate(node.outputs) if n not in self.inplace_pattern]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
......@@ -132,7 +131,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else:
scal_out.append(arg.name + '[i]')
kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
scal_in, scal_out,
dict(fail='return;'))
......@@ -169,9 +168,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
("npy_float16", "ga_half"),
("npy_float32", "ga_float"),
("npy_float64", "ga_double"),
]:
]:
kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
......@@ -399,7 +398,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
......@@ -417,7 +416,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % dict(kname=kname,fail=fail)
""" % dict(kname=kname, fail=fail)
if config.gpuarray.sync:
code += """
err = GpuArray_sync(&%(z)s->ga);
......@@ -495,7 +494,7 @@ class GpuDimShuffle(HideC, DimShuffle):
res = input
res = res.transpose(self.shuffle+self.drop)
res = res.transpose(self.shuffle + self.drop)
shape = list(res.shape[:len(self.shuffle)])
for augm in self.augment:
......@@ -533,7 +532,7 @@ class GpuDimShuffle(HideC, DimShuffle):
Py_DECREF(tmp);
return res;
}
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle+self.drop)),
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle + self.drop)),
name=name, nd_out=len(self.new_order),
copy_shape=copy_shape(len(self.new_order)))
......@@ -581,7 +580,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
pre_scalar_op
If present, must be a scalar op with only 1 input. We will execute it
on the input value before reduction.
Examples
--------
When scalar_op is a theano.scalar.basic.Add instance:
......@@ -671,8 +670,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if self.pre_scalar_op:
# Currently we only tested pre_scalar_op that don't cause
# upcast.
d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
assert d1.dtype == ret.outputs[0].dtype
assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
if self.reduce_mask is None:
if self.axis is None:
......@@ -687,8 +684,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if (x.type.ndim != len(self.reduce_mask)):
raise TypeError("x must have rank %i" % len(self.reduce_mask))
if ("complex" in x.dtype or
"complex" in ret.outputs[0].dtype or
"complex" in self._acc_dtype(x.dtype)):
"complex" in ret.outputs[0].dtype or
"complex" in self._acc_dtype(x.dtype)):
raise NotImplementedError("We don't support complex in gpu reduction")
return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
ret.outputs[0].type.broadcastable)()])
......@@ -863,14 +860,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims.
print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){'%locals(), file=sio)
print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){' % locals(),
file=sio)
self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print("}else{", file=sio)
getattr(self, 'c_code_reduce_%s'%(''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
getattr(self, 'c_code_reduce_%s' %
(''.join(str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
print("}", file=sio)
else:
getattr(self, 'c_code_reduce_%s'%(''.join(
getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
# \end bracket the reduction ...
......@@ -1094,8 +1093,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
else:
assert isinstance(self.scalar_op, (scal.Maximum,
scal.Minimum))
if self.pre_scalar_op: # TODO, multi_dtype!
#dtype = node.inputs[0].dtype
if self.pre_scalar_op: # TODO: multiple dtypes
# dtype = node.inputs[0].dtype
dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)()
......@@ -1171,7 +1170,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Parameters
----------
node, name, sub
node, name, sub
These should be passed through from the original call to c_code.
"""
......@@ -1411,7 +1410,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
"""
Parameters
----------
N
......@@ -1946,9 +1945,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
version = [16] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version())
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
......@@ -1962,7 +1958,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
......@@ -2106,10 +2102,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
#01, 011, 0111
# 01, 011, 0111
if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]):
all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]):
# this kernel uses one block for each row.
# threads per block for each element per row.
......@@ -2303,10 +2299,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
init = self._k_init(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
......@@ -2515,7 +2511,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row,
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={})
......@@ -2625,7 +2621,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename
k_var = "kernel_reduce_1011_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(
......@@ -2712,7 +2708,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
# cache the kernel object
self.get_kernel_cache(node)
return super(GpuCAReduceCPY, self).make_thunk(node, storage_map,
compute_map, no_recycling)
compute_map, no_recycling)
def get_kernel_cache(self, node):
attr = '@cache_reduction_k'
......@@ -2753,7 +2749,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objvar='k_reduk_'+name)]
objvar='k_reduk_' + name)]
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
......@@ -2768,8 +2764,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d)
GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
......@@ -2816,8 +2812,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else:
code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) {
......@@ -2828,8 +2824,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype:
code += """
......@@ -2837,12 +2833,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None);
if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
""" % dict(output=output, fail=sub['fail'],
acc_type=dtype_to_typecode(acc_dtype))
else:
code += """
tmp = %(output)s;
Py_INCREF(tmp);
""" % dict(output=output)
""" % dict(output=output)
# We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the
......@@ -2850,7 +2847,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
code += """
args[0] = &n;
args[1] = tmp->ga.data;
""" % dict(output=output)
""" % dict(output=output)
p = 2
for i in range(node.inputs[0].ndim):
......@@ -2858,7 +2855,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
args[%(p)s] = &proxy_dim[%(i)s];
n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
""" % dict(i=i, p=p, input=input)
p += 1
if not redux[i]:
code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
......@@ -2867,14 +2864,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
args[%(p)s] = %(input)s->ga.data;
proxy_off = %(input)s->ga.offset;
args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
""" % dict(p=p, input=input)
p += 2
for i in range(node.inputs[0].ndim):
code += """
proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
""" % dict(p=p, i=i, input=input)
p += 1
code += """
......@@ -2911,9 +2908,9 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
""" % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
......@@ -2942,8 +2939,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
redux = self.redux
if any(redux):
output[0] = self.get_kernel_cache(node)(input).astype(copy=False,
dtype=node.outputs[0].type.dtype)
output[0] = self.get_kernel_cache(node)(input).astype(
copy=False, dtype=node.outputs[0].type.dtype)
else:
output[0] = pygpu.gpuarray.array(input, copy=True,
dtype=node.outputs[0].type.dtype)
......
......@@ -157,7 +157,6 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py",
"sandbox/linalg/__init__.py",
"sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/kernel_codegen.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论