Commit 1547ecc6 authored by Frédéric Bastien

Merge pull request #2800 from abergeron/fp16_merge

Fp16 merge
......@@ -212,6 +212,21 @@ There are less methods to define for an Op than for a Type:
Op *must* have a `context_type` property with the Type to use
for the context variable.
.. attribute:: _f16_ok
(optional) If this attribute is absent or evaluates to `False`,
C code will be disabled for the op if any of its inputs or
outputs contain float16 data. This is added as a check to make
sure we don't compute wrong results: since there is no hardware
float16 type, special care must be taken to ensure that
operations are done correctly.
If you don't intend to deal with float16 data you can leave
this undefined.
This attribute is internal and may go away at any point during
development if a better solution is found.
The ``name`` argument is currently given an invalid value, so steer
away from it. As was the case with Type, ``sub['fail']`` provides
failure code that you *must* use if you want to raise an exception,
......
......@@ -215,6 +215,8 @@ class Shape(gof.Op):
@note: Non-differentiable.
"""
_f16_ok = True
# Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s.
......@@ -308,6 +310,8 @@ class Shape_i(gof.Op):
@note: Non-differentiable.
"""
_f16_ok = True
# Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s.
......
......@@ -18,12 +18,17 @@ def floatX_convert(s):
return "float32"
elif s == "64":
return "float64"
elif s == "16":
return "float16"
else:
return s
AddConfigVar('floatX',
"Default floating-point precision for python casts",
EnumStr('float64', 'float32', convert=floatX_convert,),
"Default floating-point precision for python casts.\n"
"\n"
"Note: float16 support is experimental, use at your own risk.",
EnumStr('float64', 'float32', 'float16',
convert=floatX_convert,),
)
AddConfigVar('warn_float64',
......@@ -39,7 +44,7 @@ AddConfigVar('cast_policy',
EnumStr('custom', 'numpy+floatX',
# The 'numpy' policy was originally planned to provide a
# smooth transition from numpy. It was meant to behave the
# same asnumpy+floatX, but keeping float64 when numpy
# same as numpy+floatX, but keeping float64 when numpy
# would. However the current implementation of some cast
# mechanisms makes it a bit more complex to add than what
# was expected, so it is currently not available.
......
......@@ -726,9 +726,20 @@ class Op(utils.object2, PureOp, CLinkerOp):
node_output_storage = [storage_map[r] for r in node.outputs]
node_input_compute = [compute_map[r] for r in node.inputs]
node_output_compute = [compute_map[r] for r in node.outputs]
#logger.debug('Compiling node %i of graph' % node_idx)
if self._op_use_c_code:
try:
# float16 get special treatment since running
# unprepared C code will get bad results.
if not getattr(self, '_f16_ok', False):
def is_f16(t):
return getattr(t, 'dtype', '') == 'float16'
if (any(is_f16(i.type) for i in node.inputs) or
any(is_f16(o.type) for o in node.outputs)):
print ("Disabling C code for %s due to unsupported "
"float16" % (self,))
raise NotImplementedError("float16")
e = FunctionGraph(node.inputs, node.outputs)
e_no_recycling = [new_o
......
......@@ -20,6 +20,7 @@ except ImportError:
pass
from .type import GpuArrayType
from .fp16_help import write_w
def as_gpuarray_variable(x):
......@@ -186,11 +187,8 @@ class GpuKernelBase(object):
class HostFromGpu(Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
__props__ = ()
_f16_ok = True
def __str__(self):
return 'HostFromGpu(gpuarray)'
......@@ -269,11 +267,8 @@ host_from_gpu = HostFromGpu()
class GpuFromHost(Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
__props__ = ()
_f16_ok = True
def __str__(self):
return 'GpuFromHost(gpuarray)'
......@@ -573,18 +568,15 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc):
__props__ = ('memset_0',)
_f16_ok = True
def __init__(self, memset_0=False):
"""memset_0 is only an optimized version. True, it mean the
value is always 0, so the c code call memset as it is faster.
"""
self.memset_0 = memset_0
def __eq__(self, other):
return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
return hash(type(self)) ^ hash(self.memset_0)
def __str__(self):
# Hide the memset parameter when not used to prevent confusion.
if self.memset_0:
......@@ -728,25 +720,17 @@ class GpuContiguous(Op):
Always return a c contiguous output. Copy the input only if it is
not already c contiguous.
"""
__props__ = ()
view_map = {0: [0]}
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
_f16_ok = True
def grad(self, inputs, dout):
x, = inputs
dout, = dout
dout = as_gpuarray_variable(dout)
return [dout]
def __str__(self):
return self.__class__.__name__
def make_node(self, input):
input = as_gpuarray_variable(input)
return Apply(self, [input], [input.type()])
......@@ -794,6 +778,8 @@ class GpuReshape(HideC, tensor.Reshape):
"""
Implement Reshape on the gpu.
"""
_f16_ok = True
# __hash__, __eq__, __str__ come from tensor.Reshape
def make_node(self, x, shp):
x = as_gpuarray_variable(x)
......@@ -831,6 +817,8 @@ class GpuReshape(HideC, tensor.Reshape):
class GpuJoin(HideC, Join):
_f16_ok = True
def make_node(self, axis, *tensors):
node = Join.make_node(self, axis, *tensors)
......@@ -888,6 +876,9 @@ class GpuSplit(HideC, Split):
class GpuEye(GpuKernelBase, Op):
__props__ = ('dtype',)
_f16_ok = True
def __init__(self, dtype=None):
if dtype is None:
dtype = config.floatX
......@@ -915,20 +906,15 @@ class GpuEye(GpuKernelBase, Op):
return [grad_undefined(self, i, inp[i])
for i in xrange(3)]
def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype
def __hash__(self):
return hash(self.dtype) ^ hash(type(self))
def gpu_kernels(self, node, name):
code = """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1;
a[i*m + i] = %(write_a)s(1);
}
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype), name=name)
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
name=name, write_a=write_w(self.dtype))
return [Kernel(
code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
......
def work_dtype(dtype):
    """Return the dtype in which computations on `dtype` data are done.

    float16 storage is promoted to float32 for the actual arithmetic
    (there is no native float16 math here); all other dtypes are
    computed in their own precision.
    """
    return 'float32' if dtype == 'float16' else dtype
def load_w(dtype):
    """Return the C-side conversion wrapper used when reading `dtype`.

    For float16 storage the stored half value must be widened with the
    CUDA intrinsic ``__half2float`` before use.  For every other dtype
    the empty string is returned, so that a generated expression like
    ``%(load)s(x)`` degrades to the plain parenthesized value ``(x)``.
    """
    return '__half2float' if dtype == 'float16' else ''
def write_w(dtype):
    """Return the C-side conversion wrapper used when writing `dtype`.

    Values computed in float32 must be narrowed with the CUDA intrinsic
    ``__float2half_rn`` (round-to-nearest) before being stored as
    float16.  Other dtypes need no conversion, so the empty string makes
    ``%(write)s(v)`` a no-op ``(v)`` in the generated code.
    """
    return '__float2half_rn' if dtype == 'float16' else ''
......@@ -16,6 +16,7 @@ from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
......@@ -24,15 +25,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
"""
nin = 3
nout = 3
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
__props__ = ()
_f16_ok = True
def make_node(self, x, b, y_idx):
# N.B. won't work when we don't cast y_idx to float anymore
......@@ -52,6 +46,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
work_x = work_dtype(dtype_x)
work_b = work_dtype(dtype_b)
load_x = load_w(dtype_x)
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
return """
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype_x)s* x_data, int xs0, int xs1,
......@@ -67,12 +67,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row;
npy_%(dtype_x)s sum = 0.0;
npy_%(work_x)s sum = 0.0;
int row_max_j = 0;
npy_%(dtype_x)s row_max = x[0] + b[0];
npy_%(work_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j)
{
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
......@@ -80,27 +81,30 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp
for (int j = 0; j < N; ++j)
{
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
npy_%(dtype_x)s sm_ij = exp(row_ij - row_max);
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
npy_%(work_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij);
}
npy_%(dtype_x)s sum_inv = 1.0 / sum;
npy_%(work_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
sm[j * sms1] *= sum_inv;
npy_%(work_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp);
}
if ((y_idx >= N) || (y_idx < 0))
{
//TODO: set raise an error bit in a global var?
nll_data[row*nlls0] = 0.0; // raise some suspicion at least...
nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least...
}
else
{
nll_data[row*nlls0] = - x[y_idx*xs1]
- b[y_idx*bs0]
nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1])
- %(load_b)s(b[y_idx*bs0])
+ row_max
+ log(sum);
+ log(sum));
}
am_data[row*ams0] = row_max_j;
}
......@@ -259,8 +263,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
return sio.getvalue()
def c_code_cache_version(self):
# return ()
return (5,)
return (6,)
def c_compiler(self):
return NVCC_compiler
......@@ -272,21 +275,13 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho
class GpuCrossentropySoftmax1HotWithBiasDx(Op):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
Gradient wrt x of the CrossentropySoftmax1Hot Op
"""
nin = 3
nout = 1
"""Gradient wrt x of the CrossentropySoftmax1Hot Op"""
def __init__(self, **kwargs):
Op.__init__(self, **kwargs)
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
__props__ = ()
_f16_ok = True
def make_node(self, dnll, sm, y_idx):
dnll = as_gpuarray_variable(dnll)
......@@ -295,8 +290,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
# return ()
return (8,)
return (9,)
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
......@@ -421,6 +415,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
work_dnll = work_dtype(dtype_dnll)
load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K,
......@@ -431,7 +429,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
{
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
npy_%(dtype_dnll)s dnll_i = dnll[i * dnll_s0];
npy_%(work_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
......@@ -439,16 +437,15 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
if (y_i == j)
{
dx[i * dx_s0 + j * dx_s1] =
dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
%(write_dx)s(dnll_i *
(%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
}
else
{
dx[i * dx_s0 + j * dx_s1] =
dnll_i * sm[i * sm_s0 + j * sm_s1];
%(write_dx)s(dnll_i *
%(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
}
//dx[i * dx_s0 + j * dx_s1] =
// dnll_i * sm[i * sm_s0 + j * sm_s1];
//dx[i*dx_s0+j*dx_s1] = 0;
}
}
}
......@@ -466,14 +463,7 @@ class GpuSoftmax (Op):
"""
Implement Softmax on the gpu.
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
__props__ = ()
def make_node(self, x):
x = as_gpuarray_variable(x)
......@@ -484,14 +474,14 @@ class GpuSoftmax (Op):
def c_code_cache_version(self):
return (12,) + inline_softmax.code_version
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -527,7 +517,7 @@ class GpuSoftmax (Op):
pygpu_default_context(), Py_None);
if (!%(z)s) {
%(fail)s
}
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],
......@@ -658,15 +648,7 @@ class GpuSoftmaxWithBias (Op):
"""
nin = 2
nout = 1
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
__props__ = ()
def make_node(self, x, b):
x = as_gpuarray_variable(x)
......@@ -675,20 +657,20 @@ class GpuSoftmaxWithBias (Op):
def infer_shape(self, node, shape):
return [shape[0]]
def c_code_cache_version(self):
return (11,) + inline_softmax.code_version
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
def c_init_code(self):
return ['setup_ext_cuda();']
def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
......@@ -738,7 +720,7 @@ class GpuSoftmaxWithBias (Op):
pygpu_default_context(), Py_None);
if (!%(z)s) {
%(fail)s
}
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024));
......
......@@ -10,9 +10,7 @@ except ImportError:
from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB,
Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer)
SequenceDB, Optimizer, toolbox)
from theano.scan_module import scan_utils, scan_op, scan_opt
......@@ -28,8 +26,7 @@ from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
from .elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda,
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
......@@ -134,7 +131,7 @@ class InputToGpuOptimizer(Optimizer):
new_input = host_from_gpu(gpu_from_host(input))
fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer")
except TypeError as e:
except TypeError:
# This could fail if the inputs are not TensorTypes
pass
......@@ -199,8 +196,8 @@ def local_gpualloc_memset_0(node):
if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
inp = node.inputs[0]
if (isinstance(inp, GpuArrayConstant) and
inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()):
inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()):
new_out = GpuAlloc(memset_0=True)(*node.inputs)
return [new_out]
......@@ -253,10 +250,11 @@ def local_gpuflatten(node):
@op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node):
op = node.op
scal_op = op.scalar_op
name = op.name
if name:
name = 'Gpu'+name
res = GpuElemwise(op.scalar_op, name=name,
res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern),
nfunc_spec=op.nfunc_spec)
return res
......@@ -343,7 +341,7 @@ def local_gpua_join(node):
def local_gpuajoin_1(node):
# join of a single element
if (isinstance(node.op, GpuJoin) and
len(node.inputs) == 2):
len(node.inputs) == 2):
return [node.inputs[1]]
......@@ -360,13 +358,13 @@ def local_gpua_subtensor(node):
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0]
if (gpu_x.owner and
isinstance(gpu_x.owner.op, GpuFromHost) and
# And it is a shared var or an input of the graph.
not gpu_x.owner.inputs[0].owner):
isinstance(gpu_x.owner.op, GpuFromHost) and
# And it is a shared var or an input of the graph.
not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1:
if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
for v in n.inputs + n.outputs])
for n, _ in node.outputs[0].clients]):
for n, _ in node.outputs[0].clients]):
return
else:
return [host_from_gpu(gpu_from_host(node.outputs[0]))]
......@@ -391,7 +389,6 @@ def local_gpua_advanced_incsubtensor(node):
return None
x, y = node.inputs[0:2]
coords = node.inputs[2:]
set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
......@@ -429,7 +426,7 @@ def local_gpua_careduce(node):
# We need to have the make node called, otherwise the mask can
# be None
if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])):
gvar.owner.op.supports_c_code([gpu_from_host(x)])):
return greduce
else:
# Try to make a simpler pattern based on reshaping
......@@ -573,7 +570,7 @@ def local_gpu_conv(node):
version=op.version,
verbose=op.verbose,
imshp=op.imshp,
)
)
if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
......@@ -633,15 +630,14 @@ def local_gpu_conv(node):
def local_gpu_elemwise_careduce(node):
""" Merge some GpuCAReduceCuda and GPUElemwise"""
if (isinstance(node.op, GpuCAReduceCuda) and
node.op.pre_scalar_op is None and
node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuElemwise) and
# The Op support all scalar with 1 inputs. We don't
# automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result
# to slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)
):
node.op.pre_scalar_op is None and
node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuElemwise) and
# The Op support all scalar with 1 inputs. We don't
# automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result
# to slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
op = node.op
inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduceCuda(scalar_op=op.scalar_op,
......
......@@ -21,6 +21,8 @@ from .comp import NVCC_compiler
class GpuSubtensor(HideC, Subtensor):
_f16_ok = True
def make_node(self, x, *inputs):
rval = tensor.Subtensor.make_node(self, x, *inputs)
otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
......
......@@ -436,7 +436,7 @@ def test_gpueye():
assert any([isinstance(node.op, GpuEye)
for node in f.maker.fgraph.toposort()])
for dtype in ['float32', 'int32']:
for dtype in ['float32', 'int32', 'float16']:
yield check, dtype, 3
# M != N, k = 0
yield check, dtype, 3, 5
......
......@@ -185,6 +185,7 @@ class GpuArrayType(Type):
# complex64, etc.
try:
return {
'float16': (float, 'npy_float16', 'NPY_FLOAT16'),
'float32': (float, 'npy_float32', 'NPY_FLOAT32'),
'float64': (float, 'npy_float64', 'NPY_FLOAT64'),
'uint8': (int, 'npy_uint8', 'NPY_UINT8'),
......@@ -309,7 +310,11 @@ class GpuArrayConstant(_operators, Constant):
def __str__(self):
if self.name is not None:
return self.name
return "GpuArrayConstant{%s}" % numpy.asarray(self.data)
try:
np_data = numpy.asarray(self.data)
except gpuarray.GpuArrayException:
np_data = self.data
return "GpuArrayConstant{%s}" % np_data
GpuArrayType.Constant = GpuArrayConstant
......
......@@ -613,60 +613,48 @@ def test_binomial():
# test empty size (scalar)
((), (), [], []),
]:
yield (t_binomial, mean, size, const_size, var_input, input,
steps, rtol)
# print ''
# print 'ON CPU with size=(%s) and mean(%d):' % (str(size), mean)
R = MRG_RandomStreams(234, use_cuda=False)
# Note: we specify `nstreams` to avoid a warning.
u = R.binomial(size=size, p=mean,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
f = theano.function(var_input, u, mode=mode)
# theano.printing.debugprint(f)
out = f(*input)
# print 'random?[:10]\n', out[0, 0:10]
# print 'random?[-1,-10:]\n', out[-1, -10:]
# Increase the number of steps if sizes implies only a few samples
if numpy.prod(const_size) < 10:
steps_ = steps * 100
else:
steps_ = steps
basictest(f, steps_, const_size, prefix='mrg cpu',
inputs=input, allow_01=True,
target_avg=mean, mean_rtol=rtol)
if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU with size=(%s) and mean(%d):' % (str(size), mean)
R = MRG_RandomStreams(234, use_cuda=True)
u = R.binomial(size=size, p=mean, dtype='float32',
nstreams=rng_mrg.guess_n_streams(size,
warn=False))
# well, it's really that this test w GPU doesn't make sense otw
assert u.dtype == 'float32'
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True), mode=mode_with_gpu)
# theano.printing.debugprint(f)
gpu_out = numpy.asarray(f(*input))
# print 'random?[:10]\n', gpu_out[0, 0:10]
# print 'random?[-1,-10:]\n', gpu_out[-1, -10:]
basictest(f, steps_, const_size, prefix='mrg gpu',
inputs=input, allow_01=True,
target_avg=mean, mean_rtol=rtol)
numpy.testing.assert_array_almost_equal(out, gpu_out,
decimal=6)
# print ''
# print 'ON CPU w NUMPY with size=(%s) and mean(%d):' % (str(size),
# mean)
RR = theano.tensor.shared_randomstreams.RandomStreams(234)
uu = RR.binomial(size=size, p=mean)
ff = theano.function(var_input, uu, mode=mode)
# It's not our problem if numpy generates 0 or 1
basictest(ff, steps_, const_size, prefix='numpy', allow_01=True,
inputs=input, target_avg=mean, mean_rtol=rtol)
def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
    """Check MRG binomial sampling statistics on CPU (and GPU when
    available) and compare against numpy-backed RandomStreams.

    NOTE(review): relies on module-level names defined elsewhere in the
    test file (`MRG_RandomStreams`, `mode`, `mode_with_gpu`,
    `cuda_available`, `basictest`, `theano`, `numpy`).  The parameter
    `input` shadows the builtin but is kept for caller compatibility.
    """
    R = MRG_RandomStreams(234, use_cuda=False)
    u = R.binomial(size=size, p=mean)
    f = theano.function(var_input, u, mode=mode)
    out = f(*input)
    # Increase the number of steps if sizes implies only a few samples
    if numpy.prod(const_size) < 10:
        steps_ = steps * 100
    else:
        steps_ = steps
    # CPU statistics: the sample mean must approach `mean` within `rtol`.
    basictest(f, steps_, const_size, prefix='mrg cpu',
              inputs=input, allow_01=True,
              target_avg=mean, mean_rtol=rtol)
    if mode != 'FAST_COMPILE' and cuda_available:
        R = MRG_RandomStreams(234, use_cuda=True)
        u = R.binomial(size=size, p=mean, dtype='float32')
        # well, it's really that this test w GPU doesn't make sense otw
        assert u.dtype == 'float32'
        f = theano.function(var_input, theano.Out(
            theano.sandbox.cuda.basic_ops.gpu_from_host(u),
            borrow=True), mode=mode_with_gpu)
        gpu_out = numpy.asarray(f(*input))
        basictest(f, steps_, const_size, prefix='mrg gpu',
                  inputs=input, allow_01=True,
                  target_avg=mean, mean_rtol=rtol)
        # Same seed (234) on both devices: the GPU stream must reproduce
        # the CPU stream to ~6 decimals.
        numpy.testing.assert_array_almost_equal(out, gpu_out,
                                                decimal=6)
    RR = theano.tensor.shared_randomstreams.RandomStreams(234)
    uu = RR.binomial(size=size, p=mean)
    ff = theano.function(var_input, uu, mode=mode)
    # It's not our problem if numpy generates 0 or 1
    basictest(ff, steps_, const_size, prefix='numpy', allow_01=True,
              inputs=input, target_avg=mean, mean_rtol=rtol)
@attr('slow')
......
......@@ -50,26 +50,34 @@ class IntegerDivisionError(Exception):
def upcast(dtype, *dtypes):
# Should we try to keep float32 instead of float64? This is used so that
# for instance mixing int64 with float32 yields float32 instead of float64.
# Note that we store this boolean as a one-element list so that it can be
# modified within `make_array`.
# This tries to keep data in floatX or lower precision, unless we
# explicitely request a higher precision datatype.
keep_float32 = [(config.cast_policy == 'numpy+floatX' and
config.floatX == 'float32')]
keep_float16 = [(config.cast_policy == 'numpy+floatX' and
config.floatX == 'float16')]
def make_array(dt):
if dt == 'float64':
# There is an explicit float64 dtype: we cannot keep float32.
keep_float32[0] = False
keep_float16[0] = False
if dt == 'float32':
keep_float16[0] = False
return numpy.zeros((), dtype=dt)
z = make_array(dtype)
for dt in dtypes:
z = z + make_array(dt=dt)
rval = str(z.dtype)
if rval == 'float64' and keep_float32[0]:
return 'float32'
else:
return rval
if rval == 'float64':
if keep_float16[0]:
return 'float16'
if keep_float32[0]:
return 'float32'
elif rval == 'float32':
if keep_float16[0]:
return 'float16'
return rval
def get_scalar_type(dtype):
......@@ -232,6 +240,7 @@ class Scalar(Type):
print(dtype, np.zeros(1, dtype=dtype).dtype.num)
"""
return { # dtype: (py_type, c_type, cls_name)
'float16': (numpy.float16, 'npy_float16', 'Float16'),
'float32': (numpy.float32, 'npy_float32', 'Float32'),
'float64': (numpy.float64, 'npy_float64', 'Float64'),
'complex128': (numpy.complex128, 'theano_complex128',
......@@ -501,6 +510,7 @@ uint8 = get_scalar_type('uint8')
uint16 = get_scalar_type('uint16')
uint32 = get_scalar_type('uint32')
uint64 = get_scalar_type('uint64')
float16 = get_scalar_type('float16')
float32 = get_scalar_type('float32')
float64 = get_scalar_type('float64')
complex64 = get_scalar_type('complex64')
......@@ -508,7 +518,7 @@ complex128 = get_scalar_type('complex128')
int_types = int8, int16, int32, int64
uint_types = uint8, uint16, uint32, uint64
float_types = float32, float64
float_types = float16, float32, float64
complex_types = complex64, complex128
discrete_types = int_types + uint_types
......@@ -1995,6 +2005,7 @@ convert_to_uint8 = Cast(uint8, name='convert_to_uint8')
convert_to_uint16 = Cast(uint16, name='convert_to_uint16')
convert_to_uint32 = Cast(uint32, name='convert_to_uint32')
convert_to_uint64 = Cast(uint64, name='convert_to_uint64')
convert_to_float16 = Cast(float16, name='convert_to_float16')
convert_to_float32 = Cast(float32, name='convert_to_float32')
convert_to_float64 = Cast(float64, name='convert_to_float64')
convert_to_complex64 = Cast(complex64, name='convert_to_complex64')
......@@ -2009,6 +2020,7 @@ _cast_mapping = {
'uint16': convert_to_uint16,
'uint32': convert_to_uint32,
'uint64': convert_to_uint64,
'float16': convert_to_float16,
'float32': convert_to_float32,
'float64': convert_to_float64,
'complex64': convert_to_complex64,
......@@ -3286,14 +3298,20 @@ class Composite(ScalarOp):
+ zip(self.fgraph.outputs,
["%%(o%i)s" % i for i in xrange(len(self.fgraph.outputs))]))
for orphan in self.fgraph.variables: # fgraph.orphans:
if orphan.owner is None and orphan not in self.fgraph.inputs:
if isinstance(orphan, Constant):
subd[orphan] = orphan.type.c_literal(orphan.data)
else:
raise ValueError(
"All orphans in the fgraph to Composite must"
" be Constant instances.")
for var in self.fgraph.variables:
if var.owner is None:
if var not in self.fgraph.inputs:
# This is an orphan
if isinstance(var, Constant):
subd[var] = var.type.c_literal(var.data)
else:
raise ValueError(
"All orphans in the fgraph to Composite must"
" be Constant instances.")
elif (any(i.dtype == 'float16' for i in var.owner.inputs) or
any(o.dtype == 'float16' for o in var.owner.outputs)):
# flag for elemwise ops to check.
self.inner_float16 = True
_c_code = "{\n"
self.nodenames = ["%(nodename)s_" + ('subnode%i' % j)
......
......@@ -2370,6 +2370,9 @@ class CastTester(utt.InferShapeTester):
for format in sparse.sparse_formats:
for i_dtype in sparse.float_dtypes:
for o_dtype in tensor.float_dtypes:
if o_dtype == 'float16':
# Don't test float16 output.
continue
_, data = sparse_random_inputs(
format,
shape=(4, 7),
......
......@@ -252,10 +252,10 @@ class NumpyAutocaster(object):
return numpy.asarray(x)
elif config.cast_policy == 'numpy+floatX':
rval = numpy.asarray(x)
if ((rval.dtype == 'float64' and # numpy wants float64
config.floatX == 'float32' and # but we prefer float32
not hasattr(x, 'dtype'))): # and `x` was not typed
rval = theano._asarray(rval, dtype='float32')
if ((not hasattr(x, 'dtype') and
rval.dtype in ('float64', 'float32') and
rval.dtype != config.floatX)):
rval = theano._asarray(rval, dtype=config.floatX)
return rval
# The following is the original code, corresponding to the 'custom'
......@@ -278,11 +278,14 @@ class NumpyAutocaster(object):
# recall: float is numpy.float
if ((isinstance(x, float) and
config.floatX in self.dtypes and
config.floatX == 'float32')):
config.floatX != 'float64')):
return theano._asarray(x, dtype=config.floatX)
return theano._asarray(x, dtype='float32')
# Don't autocast to float16 unless config.floatX is float16
try_dtypes = [d for d in self.dtypes
if config.floatX == 'float16' or d != 'float16']
for dtype in self.dtypes:
for dtype in try_dtypes:
x_ = theano._asarray(x, dtype=dtype)
if numpy.all(x == x_):
break
......@@ -290,7 +293,7 @@ class NumpyAutocaster(object):
return x_
autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64'))
autocast_float = NumpyAutocaster(('float32', 'float64'))
autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
# autocast_float dtypes might be manipulated in tensor.__init__
......@@ -313,7 +316,7 @@ class autocast_float_as(object):
If `config.cast_policy` is not 'custom', an exception is raised.
For example:
>>> with autocast_float_as('float32') as _dummy:
>>> with autocast_float_as('float32'):
... assert (fvector() + 1.1).dtype == 'float32' # temporary downcasting
>>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour
......@@ -1137,6 +1140,10 @@ _convert_to_uint64 = _conversion(
elemwise.Elemwise(scal.convert_to_uint64), 'uint64')
"""Cast to unsigned 64-bit integer"""
_convert_to_float16 = _conversion(
elemwise.Elemwise(scal.convert_to_float16), 'float16')
"""Cast to half-precision floating point"""
_convert_to_float32 = _conversion(
elemwise.Elemwise(scal.convert_to_float32), 'float32')
"""Cast to single-precision floating point"""
......@@ -1162,6 +1169,7 @@ _cast_mapping = {
'uint16': _convert_to_uint16,
'uint32': _convert_to_uint32,
'uint64': _convert_to_uint64,
'float16': _convert_to_float16,
'float32': _convert_to_float32,
'float64': _convert_to_float64,
'complex64': _convert_to_complex64,
......@@ -2757,9 +2765,13 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
# sum() will complain if it is not suitable.
sum_dtype = dtype
else:
# Let sum() infer the appropriate dtype.
sum_dtype = None
# float16 overflows way too fast for sum
if ((sum_dtype == 'float16' or input.dtype == 'float16') and
acc_dtype != 'float16'):
sum_dtype == 'float32'
s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims,
acc_dtype=acc_dtype)
shp = shape(input)
......@@ -2767,7 +2779,7 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
# Cast shp into a float type
# TODO Once we have a consistent casting policy, we could simply
# use true_div.
if s.dtype in ('float32', 'complex64'):
if s.dtype in ('float16', 'float32', 'complex64'):
shp = cast(shp, 'float32')
else:
shp = cast(shp, 'float64')
......@@ -2785,6 +2797,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
for i in axis:
s = true_div(s, shp[i])
if dtype == 'float16' or (dtype is None and input.dtype == 'float16'):
s = cast(s, 'float16')
return s
......
......@@ -417,7 +417,8 @@ class Gemv(Op):
def perform(self, node, inputs, out_storage):
y, alpha, A, x, beta = inputs
if have_fblas and y.shape[0] != 0 and x.shape[0] != 0:
if (have_fblas and y.shape[0] != 0 and x.shape[0] != 0 and
y.dtype in _blas_gemv_fns):
gemv = _blas_gemv_fns[y.dtype]
if (A.shape[0] != y.shape[0] or A.shape[1] != x.shape[0]):
......@@ -1727,7 +1728,7 @@ def local_dot_to_dot22(node):
x, y, x.type, y.type)
return
if y.type.dtype.startswith('float') or y.type.dtype.startswith('complex'):
if y.type.dtype in ['float32', 'float64', 'complex64', 'complex128']:
if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)]
......
......@@ -95,6 +95,7 @@ class DimShuffle(Op):
transpose function.
Adding, subtracting dimensions can be done with reshape.
"""
_f16_ok = True
check_input = False
......@@ -1171,6 +1172,12 @@ class Elemwise(OpenMPOp):
return decl, checks, alloc, loop
def c_code(self, node, nodename, inames, onames, sub):
    """Return the C implementation for this Elemwise apply node.

    C code is refused whenever float16 is involved, because the
    generated loops are not prepared for half-precision storage and
    would compute wrong results (see the `_f16_ok` mechanism).
    """
    if (any(i.dtype == 'float16' for i in node.inputs) or
            any(o.dtype == 'float16' for o in node.outputs) or
            # This is for Composite
            getattr(self.scalar_op, 'inner_float16', False)):
        # Disable C code for float16 vars
        # NOTE(review): there is no `return`/`raise` after this call;
        # this presumably relies on the base-class c_code raising
        # (e.g. MethodNotDefined) to abort C compilation — confirm
        # against the Op base class, otherwise control would fall
        # through to the C-code generation below.
        super(Elemwise, self).c_code(node, nodename, inames, onames, sub)
    code = "\n".join(self._c_all(node, nodename, inames, onames, sub))
    return code
......@@ -1186,7 +1193,7 @@ class Elemwise(OpenMPOp):
return support_code
def c_code_cache_version_apply(self, node):
version = [11] # the version corresponding to the c code in this Op
version = [12] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......@@ -1806,6 +1813,7 @@ class CAReduceDtype(CAReduce):
uint8='uint64',
uint16='uint64',
uint32='uint64',
float16='float32',
float32='float64',
complex64='complex128',
).get(idtype, idtype)
......
......@@ -1746,7 +1746,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# Check z is zeros_like(log(sm))
if not _is_const(z, 0):
return
if z.type not in (dmatrix, fmatrix):
if z.broadcastable != (False, False):
if not (vector_softmax and z.broadcastable == (True, False)):
return
# here we know that we are incrementing a matrix of zeros
......@@ -1758,14 +1758,15 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes:
return
# here we know that we are incrementing some part of matrix z by a vector
# here we know that we are incrementing some part of
# matrix z by a vector
# unless the user has taken care to mark that the data and labels have the
# same number of rows, we cannot be sure here that
# len(y) == len(z)
# However, in the common case that these are predictions and labels it is true.
# We leave it to the Op to crash (and the user to complain) if this assumption is
# ever not true.
# unless the user has taken care to mark that the data and
# labels have the same number of rows, we cannot be sure
# here that len(y) == len(z) However, in the common case
# that these are predictions and labels it is true. We
# leave it to the Op to crash (and the user to complain)
# if this assumption is ever not true.
out_grad = -incr
......
......@@ -1564,6 +1564,7 @@ class Assert(T.Op):
used in the function computing the graph, but it doesn't have to be
returned.
"""
__props__ = ('msg',)
view_map = {0: [0]}
check_input = False
......@@ -1583,24 +1584,18 @@ class Assert(T.Op):
assert numpy.all([c.type.ndim == 0 for c in cond])
return gof.Apply(self, [value] + cond, [value.type()])
def __str__(self):
    # Render the op simply by its concrete class name; the failure
    # message is not part of the printed form.
    name = type(self).__name__
    return name
def perform(self, node, inputs, out_):
    # Pass the first input through unchanged; every remaining input is
    # a condition that must evaluate truthy, otherwise fail with the
    # op's configured message.
    (out,) = out_
    value = inputs[0]
    out[0] = value
    assert numpy.all(inputs[1:]), self.msg
def __eq__(self, other):
    # Two Assert ops are interchangeable only when they are of the
    # same concrete class and would report the same message.
    if type(self) != type(other):
        return False
    return self.msg == other.msg
def __hash__(self):
    # Mirror __eq__: fold the class identity and the message together
    # so equal ops hash alike.
    type_part = hash(type(self))
    msg_part = hash(self.msg)
    return type_part ^ msg_part
def grad(self, input, output_gradients):
    # The gradient flows through the asserted value only; all the
    # condition inputs are disconnected from the output.
    n_conditions = len(input) - 1
    disconnected = [DisconnectedType()()] * n_conditions
    return output_gradients + disconnected
def connection_pattern(self, node):
    # Only input 0 (the value) is connected to the single output; the
    # condition inputs never influence it.
    pattern = [[1]]
    for _ in node.inputs[1:]:
        pattern.append([0])
    return pattern
def c_code(self, node, name, inames, onames, sub):
value = inames[0]
out = onames[0]
......
......@@ -5820,40 +5820,24 @@ def _test_autocast_custom():
orig_autocast = autocast_float.dtypes
# Test that autocast_float_as sets the autocast dtype correctly
try: # ghetto 2.4 version of with
ac = autocast_float_as('float32')
ac.__enter__()
with autocast_float_as('float32'):
assert autocast_float.dtypes == ('float32',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast
try: # ghetto 2.4 version of with
ac = autocast_float_as('float64')
ac.__enter__()
with autocast_float_as('float64'):
assert autocast_float.dtypes == ('float64',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast
# Test that we can set it back to something, and nest it
try: # ghetto 2.4 version of with
ac = autocast_float_as('float32')
ac.__enter__()
with autocast_float_as('float32'):
assert autocast_float.dtypes == ('float32',)
try: # ghetto 2.4 version of with
ac2 = autocast_float_as('float64')
ac2.__enter__()
with autocast_float_as('float64'):
assert autocast_float.dtypes == ('float64',)
finally:
ac2.__exit__()
assert autocast_float.dtypes == ('float32',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast
# Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with
ac = autocast_float_as('float32')
ac.__enter__()
with autocast_float_as('float32'):
assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == 'float32'
assert (fvector() + theano._asarray(1.1, dtype='float64')).dtype == \
......@@ -5863,13 +5847,9 @@ def _test_autocast_custom():
assert (dvector() + 1).dtype == 'float64'
assert (fvector() + 1).dtype == 'float32'
finally:
ac.__exit__()
# Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with
ac = autocast_float_as('float64')
ac.__enter__()
with autocast_float_as('float64'):
assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.0).dtype == 'float64'
......@@ -5880,13 +5860,9 @@ def _test_autocast_custom():
assert (dvector() + 1).dtype == 'float64'
assert (fvector() + 1).dtype == 'float32'
finally:
ac.__exit__()
# Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with
ac = autocast_float_as('float32', 'float64')
ac.__enter__()
with autocast_float_as('float32', 'float64'):
assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == theano.config.floatX
assert (fvector() + 1.0).dtype == 'float32'
......@@ -5903,14 +5879,8 @@ def _test_autocast_custom():
assert (ivector() + numpy.int8(1)).dtype == 'int32'
assert (wvector() + numpy.int8(1)).dtype == 'int16'
assert (bvector() + numpy.int8(1)).dtype == 'int8'
try: # ghetto 2.4 version of with
ac2 = autocast_float_as('float64')
ac2.__enter__()
with autocast_float_as('float64'):
assert (fvector() + 1.0).dtype == 'float64'
finally:
ac2.__exit__()
finally:
ac.__exit__()
def _test_autocast_numpy():
......@@ -6036,17 +6006,8 @@ class test_arithmetic_cast(unittest.TestCase):
config.int_division == 'raise')
# This is the expected behavior.
continue
# For numpy we have a problem:
# http://projects.scipy.org/numpy/ticket/1827
# As a result we only consider the highest data
# type that numpy may return.
numpy_dtypes = [
op(numpy_args[0](a_type),
numpy_args[1](b_type)).dtype,
op(numpy_args[1](b_type),
numpy_args[0](a_type)).dtype]
numpy_dtype = theano.scalar.upcast(
*map(str, numpy_dtypes))
numpy_dtype = op(numpy_args[0](a_type),
numpy_args[1](b_type)).dtype
if numpy_dtype == theano_dtype:
# Same data type found, all is good!
continue
......@@ -6078,9 +6039,7 @@ class test_arithmetic_cast(unittest.TestCase):
# Theano upcasted the result array.
theano_dtype == up_type and
# But Numpy kept its original type.
# (not an equality because of numpy bug
# mentioned above).
array_type in numpy_dtypes):
array_type == numpy_dtype):
# Then we accept this difference in
# behavior.
continue
......@@ -6092,17 +6051,20 @@ class test_arithmetic_cast(unittest.TestCase):
numpy.__version__.split('.')[:2]]
if (cfg == 'numpy+floatX' and
a_type == 'complex128' and
b_type == 'float32' and
(b_type == 'float32' or
b_type == 'float16') and
combo == ('scalar', 'array') and
bool(numpy_version >= [1, 6]) and
theano_dtype == 'complex128' and
numpy_dtypes == ['complex64',
'complex64']):
# In numpy 1.6.x adding a complex128 with
# a float32 may result in a complex64. This
# may be a bug (investigation is currently
# in progress), so in the meantime we just
# mark this test as a known failure.
numpy_dtype == 'complex64'):
# In numpy 1.6.x adding a
# complex128 with a float32 or
# float16 may result in a
# complex64. This may be a bug
# (investigation is currently in
# progress), so in the meantime we
# just mark this test as a known
# failure.
raise KnownFailureTest('Known issue with '
'numpy >= 1.6.x see #761')
......
......@@ -1024,6 +1024,7 @@ class T_prod_without_zeros_dtype(unittest.TestCase):
uint8='uint64',
uint16='uint64',
uint32='uint64',
float16='float32',
float32='float64',
complex64='complex128'
).get(dtype, dtype)
......
......@@ -234,6 +234,7 @@ class TensorType(Type):
# complex64, etc.
try:
return {
'float16': (float, 'npy_float16', 'NPY_FLOAT16'),
'float32': (float, 'npy_float32', 'NPY_FLOAT32'),
'float64': (float, 'npy_float64', 'NPY_FLOAT64'),
'uint8': (int, 'npy_uint8', 'NPY_UINT8'),
......
......@@ -6,6 +6,7 @@ from nose.plugins.skip import SkipTest
import os
from fnmatch import fnmatch
import theano
from theano.compat import PY3
try:
import flake8.engine
import flake8.main
......@@ -227,7 +228,6 @@ whitelist_flake8 = [
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/opt.py",
"sandbox/gpuarray/blas.py",
"sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py",
......@@ -347,6 +347,8 @@ def test_format_flake8():
"""
if not flake8_available:
raise SkipTest("flake8 is not installed")
if PY3:
raise SkipTest("not testing in python3 since 2to3 ran")
total_errors = 0
for path in list_files():
rel_path = os.path.relpath(path, theano.__path__[0])
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论