提交 ed7759fb authored 作者: Chiheb Trabelsi's avatar Chiheb Trabelsi

basic_ops.py has been modified to conform to the flake8 style guide

上级 200babca
......@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
import copy
import logging
import sys
import warnings
import numpy
from six import iteritems
from six.moves import StringIO, xrange
......@@ -12,6 +12,9 @@ from theano import gof, Type, Apply
from theano import tensor, scalar, config
from theano.gradient import grad_undefined
from theano.scalar import Scalar
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.elemwise import NaiveAlgo
scal = scalar # somewhere scalar gets reassigned to be a function
......@@ -24,10 +27,6 @@ try:
except ImportError:
pass
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.elemwise import NaiveAlgo
_logger_name = 'theano.sandbox.cuda.basic_ops'
_logger = logging.getLogger(_logger_name)
......@@ -79,7 +78,7 @@ class HostFromGpu(GpuOp):
"CudaNdarrayType. Got %s with type %s" % (x,
x.type))
return Apply(self, [x], [tensor.TensorType(dtype=x.dtype,
broadcastable=x.broadcastable)()])
broadcastable=x.broadcastable)()])
def perform(self, node, inp, out):
x, = inp
......@@ -535,10 +534,10 @@ class GpuCAReduce(GpuOp):
Parameters
----------
pre_scalar_op
pre_scalar_op
If present, must be a scalar op with only 1 input.
We will execute it on the input value before reduction.
Notes
-----
This Op is a work in progress.
......@@ -596,10 +595,8 @@ class GpuCAReduce(GpuOp):
if self.pre_scalar_op:
pre = "pre=%s,red=" % str(self.pre_scalar_op)
return "GpuCAReduce{%s%s}{%s}" % (
pre,
str(self.scalar_op),
','.join(str(i) for i in self.reduce_mask)
)
pre, str(self.scalar_op),
','.join(str(i) for i in self.reduce_mask))
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -775,15 +772,18 @@ class GpuCAReduce(GpuOp):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims.
print('if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals(), file=sio)
print('if(CudaNdarray_is_c_contiguous( %(x)s)){' % locals(),
file=sio)
self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print("}else{", file=sio)
getattr(self, 'c_code_reduce_%s'%(''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
print("}", file=sio)
else:
getattr(self, 'c_code_reduce_%s'%(''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
# \end bracket the reduction ...
print("""
......@@ -976,7 +976,7 @@ class GpuCAReduce(GpuOp):
assert isinstance(self.scalar_op, (scal.Maximum,
scal.Minimum))
if self.pre_scalar_op:
#dtype = node.inputs[0].dtype
# dtype = node.inputs[0].dtype
dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)()
......@@ -1275,7 +1275,7 @@ class GpuCAReduce(GpuOp):
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
"""
Parameters
----------
N : int
......@@ -1834,12 +1834,15 @@ class GpuCAReduce(GpuOp):
version = [15] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
Apply(self.scalar_op,
[Scalar(
dtype=input.type.dtype)() for input in node.inputs],
[Scalar(
dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version())
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
version.extend(
Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version):
return tuple(version)
else:
......@@ -1946,10 +1949,11 @@ class GpuCAReduce(GpuOp):
%(reducebuf)s
}
""" % locals(), file=sio)
#01, 011, 0111
# 01, 011, 0111
if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]):
all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]):
# this kernel uses one block for each row.
# threads per block for each element per row.
......@@ -2117,10 +2121,10 @@ class GpuCAReduce(GpuOp):
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
init = self._k_init(node, nodename)
decl = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
......@@ -2294,7 +2298,7 @@ class GpuCAReduce(GpuOp):
}
""" % locals(), file=sio)
if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row,
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={})
......@@ -2470,7 +2474,7 @@ class GpuReshape(tensor.Reshape, GpuOp):
if (x.size % ss) != 0:
raise ValueError("When using -1 in new shape, the computed new shape must be an multiple of the original shape.")
shp_new = numpy.copy(shp)
shp_new[m1_idx] = x.size/ss
shp_new[m1_idx] = x.size / ss
shp = shp_new
else:
......@@ -2711,7 +2715,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
# c code suppose it is int64
if x.ndim in [1, 2, 3] and ilist_.dtype in [
'int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']:
'int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']:
ilist_ = tensor.cast(ilist_, 'int64')
bcast = (ilist_.broadcastable[0],) + x_.broadcastable[1:]
......@@ -2721,7 +2725,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
def perform(self, node, inp, out_):
# This don't work as CudaNdarray_Subscript() don't support it.
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
# super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp
out, = out_
x_orig = x
......@@ -2733,7 +2737,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
if x.ndim <= 3:
# CudaNdarray.take only supports ndim <= 3
if self.perform_using_take is not None:
assert self.perform_using_take == True, (
assert self.perform_using_take is True, (
"GpuAdvancedSubtensor1 used the fast version")
if idx.dtype != numpy.int64:
if idx.dtype in [numpy.int8, numpy.int16, numpy.int32,
......@@ -2762,7 +2766,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
out[0] = o
else:
if self.perform_using_take is not None:
assert self.perform_using_take == False, (
assert self.perform_using_take is False, (
"GpuAdvancedSubtensor1 didn't use the fast version")
if out_[0][0] is None or out_[0][0].shape != out_shape:
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
......@@ -3006,8 +3010,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
convert_map = {8: tensor.basic._convert_to_int8,
16: tensor.basic._convert_to_int16,
32: tensor.basic._convert_to_int32,
64: tensor.basic._convert_to_int64
}
64: tensor.basic._convert_to_int64}
intwidth = theano.configdefaults.python_int_bitwidth()
ilist_ = convert_map[intwidth](ilist_)
......@@ -3039,8 +3042,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = device_properties(active_device_no)['major']
if ((node.inputs[0].ndim != node.inputs[1].ndim) or
(node.inputs[0].ndim != 2) or
(compute_capability < 2)):
(node.inputs[0].ndim != 2) or
(compute_capability < 2)):
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
......@@ -3212,7 +3215,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
return Apply(self, [x, y] + rval.inputs[2:], [x.type()])
def do_type_checking(self, node):
"""
"""
Should raise NotImplementedError if c_code does not support
the types involved in this node.
......@@ -3248,7 +3251,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
"""
Parameters
----------
----------
x : str
A string identifying an array to be viewed.
view_ndim : str
......@@ -3354,7 +3357,6 @@ class GpuFlatten(gof.HideC, tensor.Flatten, GpuOp):
return Apply(self, [x], [out_type()])
def gpu_flatten(x, outdim=1):
"""
Implement flatten on the gpu.
......@@ -3378,10 +3380,10 @@ def gpu_flatten(x, outdim=1):
"""
x = as_cuda_ndarray_variable(x)
if outdim > 1:
dims = tuple(x.shape[:outdim-1])+(-1,)
dims = tuple(x.shape[:outdim - 1]) + (-1, )
else:
dims = (-1,)
return GpuReshape(outdim)(x, dims)
dims = (-1, )
return GpuReshape(outdim)(x, dims)
class GpuShape(tensor.Shape, GpuOp):
......@@ -3408,12 +3410,11 @@ class GpuJoin(tensor.Join, GpuOp):
as_tensor_variable_args = [as_cuda_ndarray_variable(x)
for x in tensors]
output_maker = \
lambda bcast: CudaNdarrayType(broadcastable=bcast)()
def output_maker(bcast):
return(CudaNdarrayType(broadcastable=bcast)())
return tensor.Join._make_node_internal(self,
axis, tensors,
as_tensor_variable_args, output_maker)
return tensor.Join._make_node_internal(
self, axis, tensors, as_tensor_variable_args, output_maker)
def perform(self, node, axis_and_tensors, out_):
out, = out_
......@@ -3464,8 +3465,8 @@ class GpuJoin(tensor.Join, GpuOp):
# except for 'axis'
def construct_slices(curlen):
slices = [slice(None, None, None) for i in \
xrange(len(template_shape))]
slices = [slice(None, None, None) for i in
xrange(len(template_shape))]
slices[axis] = slice(curpos, curpos + curlen, None)
return tuple(slices)
......@@ -3829,23 +3830,22 @@ class GpuAlloc(GpuAllocEmpty):
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif ( # The following ops work inplace of their input id 0.
client[1] == 0 and
isinstance(client[0].op, (
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# constant and this is less efficients.
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# always exists.
# theano.tensor.subtensor.AdvancedIncSubtensor,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1,
theano.sandbox.cuda.blas.GpuGemm,
theano.sandbox.cuda.blas.GpuGemv,
theano.sandbox.cuda.blas.GpuGer,
))):
# Else if the following ops work inplace of their input id 0.
elif(client[1] == 0 and
isinstance(client[0].op, (
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# constant and this is less efficients.
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# always exists.
# theano.tensor.subtensor.AdvancedIncSubtensor,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1,
theano.sandbox.cuda.blas.GpuGemm,
theano.sandbox.cuda.blas.GpuGemv,
theano.sandbox.cuda.blas.GpuGer,))):
return False
# If the clients is a transfer, we don't want to fold. We
# let the moving opt finish before deciding what to do.
......@@ -3859,7 +3859,7 @@ gpu_alloc = GpuAlloc()
class CopyOnNegativeStrides(GpuOp):
"""
Checks if the input has contains negative strides.
If it does, returns a c contiguous copy.
"""
......@@ -4017,7 +4017,7 @@ def scalar(name=None, dtype=None):
Parameters
----------
dtype
dtype
Numeric type (None means to use theano.config.floatX).
name : str
A name to attach to this variable.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论