提交 1547ecc6 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2800 from abergeron/fp16_merge

Fp16 merge
...@@ -212,6 +212,21 @@ There are less methods to define for an Op than for a Type: ...@@ -212,6 +212,21 @@ There are less methods to define for an Op than for a Type:
Op *must* have a `context_type` property with the Type to use Op *must* have a `context_type` property with the Type to use
for the context variable. for the context variable.
.. attribute:: _f16_ok
(optional) If this attribute is absent or evaluates to `False`,
C code will be disabled for the op if any of its inputs or
outputs contains float16 data. This is added as a check to make
sure we don't compute wrong results since there is no hardware
float16 type so special care must be taken to make sure
operations are done correctly.
If you don't intend to deal with float16 data you can leave
this undefined.
This attribute is internal and may go away at any point during
developpment if a better solution is found.
The ``name`` argument is currently given an invalid value, so steer The ``name`` argument is currently given an invalid value, so steer
away from it. As was the case with Type, ``sub['fail']`` provides away from it. As was the case with Type, ``sub['fail']`` provides
failure code that you *must* use if you want to raise an exception, failure code that you *must* use if you want to raise an exception,
......
...@@ -215,6 +215,8 @@ class Shape(gof.Op): ...@@ -215,6 +215,8 @@ class Shape(gof.Op):
@note: Non-differentiable. @note: Non-differentiable.
""" """
_f16_ok = True
# Mapping from Type to C code (and version) to use. # Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s, # In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s. # the output variable is %(oname)s.
...@@ -308,6 +310,8 @@ class Shape_i(gof.Op): ...@@ -308,6 +310,8 @@ class Shape_i(gof.Op):
@note: Non-differentiable. @note: Non-differentiable.
""" """
_f16_ok = True
# Mapping from Type to C code (and version) to use. # Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s, # In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s. # the output variable is %(oname)s.
......
...@@ -18,12 +18,17 @@ def floatX_convert(s): ...@@ -18,12 +18,17 @@ def floatX_convert(s):
return "float32" return "float32"
elif s == "64": elif s == "64":
return "float64" return "float64"
elif s == "16":
return "float16"
else: else:
return s return s
AddConfigVar('floatX', AddConfigVar('floatX',
"Default floating-point precision for python casts", "Default floating-point precision for python casts.\n"
EnumStr('float64', 'float32', convert=floatX_convert,), "\n"
"Note: float16 support is experimental, use at your own risk.",
EnumStr('float64', 'float32', 'float16',
convert=floatX_convert,),
) )
AddConfigVar('warn_float64', AddConfigVar('warn_float64',
...@@ -39,7 +44,7 @@ AddConfigVar('cast_policy', ...@@ -39,7 +44,7 @@ AddConfigVar('cast_policy',
EnumStr('custom', 'numpy+floatX', EnumStr('custom', 'numpy+floatX',
# The 'numpy' policy was originally planned to provide a # The 'numpy' policy was originally planned to provide a
# smooth transition from numpy. It was meant to behave the # smooth transition from numpy. It was meant to behave the
# same asnumpy+floatX, but keeping float64 when numpy # same as numpy+floatX, but keeping float64 when numpy
# would. However the current implementation of some cast # would. However the current implementation of some cast
# mechanisms makes it a bit more complex to add than what # mechanisms makes it a bit more complex to add than what
# was expected, so it is currently not available. # was expected, so it is currently not available.
......
...@@ -726,9 +726,20 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -726,9 +726,20 @@ class Op(utils.object2, PureOp, CLinkerOp):
node_output_storage = [storage_map[r] for r in node.outputs] node_output_storage = [storage_map[r] for r in node.outputs]
node_input_compute = [compute_map[r] for r in node.inputs] node_input_compute = [compute_map[r] for r in node.inputs]
node_output_compute = [compute_map[r] for r in node.outputs] node_output_compute = [compute_map[r] for r in node.outputs]
#logger.debug('Compiling node %i of graph' % node_idx)
if self._op_use_c_code: if self._op_use_c_code:
try: try:
# float16 get special treatment since running
# unprepared C code will get bad results.
if not getattr(self, '_f16_ok', False):
def is_f16(t):
return getattr(t, 'dtype', '') == 'float16'
if (any(is_f16(i.type) for i in node.inputs) or
any(is_f16(o.type) for o in node.outputs)):
print ("Disabling C code for %s due to unsupported "
"float16" % (self,))
raise NotImplementedError("float16")
e = FunctionGraph(node.inputs, node.outputs) e = FunctionGraph(node.inputs, node.outputs)
e_no_recycling = [new_o e_no_recycling = [new_o
......
...@@ -20,6 +20,7 @@ except ImportError: ...@@ -20,6 +20,7 @@ except ImportError:
pass pass
from .type import GpuArrayType from .type import GpuArrayType
from .fp16_help import write_w
def as_gpuarray_variable(x): def as_gpuarray_variable(x):
...@@ -186,11 +187,8 @@ class GpuKernelBase(object): ...@@ -186,11 +187,8 @@ class GpuKernelBase(object):
class HostFromGpu(Op): class HostFromGpu(Op):
def __eq__(self, other): __props__ = ()
return type(self) == type(other) _f16_ok = True
def __hash__(self):
return hash(type(self))
def __str__(self): def __str__(self):
return 'HostFromGpu(gpuarray)' return 'HostFromGpu(gpuarray)'
...@@ -269,11 +267,8 @@ host_from_gpu = HostFromGpu() ...@@ -269,11 +267,8 @@ host_from_gpu = HostFromGpu()
class GpuFromHost(Op): class GpuFromHost(Op):
def __eq__(self, other): __props__ = ()
return type(self) == type(other) _f16_ok = True
def __hash__(self):
return hash(type(self))
def __str__(self): def __str__(self):
return 'GpuFromHost(gpuarray)' return 'GpuFromHost(gpuarray)'
...@@ -573,18 +568,15 @@ cuda_from_gpu = CudaFromGpu() ...@@ -573,18 +568,15 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
__props__ = ('memset_0',)
_f16_ok = True
def __init__(self, memset_0=False): def __init__(self, memset_0=False):
"""memset_0 is only an optimized version. True, it mean the """memset_0 is only an optimized version. True, it mean the
value is always 0, so the c code call memset as it is faster. value is always 0, so the c code call memset as it is faster.
""" """
self.memset_0 = memset_0 self.memset_0 = memset_0
def __eq__(self, other):
return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
return hash(type(self)) ^ hash(self.memset_0)
def __str__(self): def __str__(self):
# Hide the memset parameter when not used to prevent confusion. # Hide the memset parameter when not used to prevent confusion.
if self.memset_0: if self.memset_0:
...@@ -728,25 +720,17 @@ class GpuContiguous(Op): ...@@ -728,25 +720,17 @@ class GpuContiguous(Op):
Always return a c contiguous output. Copy the input only if it is Always return a c contiguous output. Copy the input only if it is
not already c contiguous. not already c contiguous.
""" """
__props__ = ()
view_map = {0: [0]} view_map = {0: [0]}
_f16_ok = True
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def grad(self, inputs, dout): def grad(self, inputs, dout):
x, = inputs x, = inputs
dout, = dout dout, = dout
dout = as_gpuarray_variable(dout) dout = as_gpuarray_variable(dout)
return [dout] return [dout]
def __str__(self):
return self.__class__.__name__
def make_node(self, input): def make_node(self, input):
input = as_gpuarray_variable(input) input = as_gpuarray_variable(input)
return Apply(self, [input], [input.type()]) return Apply(self, [input], [input.type()])
...@@ -794,6 +778,8 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -794,6 +778,8 @@ class GpuReshape(HideC, tensor.Reshape):
""" """
Implement Reshape on the gpu. Implement Reshape on the gpu.
""" """
_f16_ok = True
# __hash__, __eq__, __str__ come from tensor.Reshape # __hash__, __eq__, __str__ come from tensor.Reshape
def make_node(self, x, shp): def make_node(self, x, shp):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
...@@ -831,6 +817,8 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -831,6 +817,8 @@ class GpuReshape(HideC, tensor.Reshape):
class GpuJoin(HideC, Join): class GpuJoin(HideC, Join):
_f16_ok = True
def make_node(self, axis, *tensors): def make_node(self, axis, *tensors):
node = Join.make_node(self, axis, *tensors) node = Join.make_node(self, axis, *tensors)
...@@ -888,6 +876,9 @@ class GpuSplit(HideC, Split): ...@@ -888,6 +876,9 @@ class GpuSplit(HideC, Split):
class GpuEye(GpuKernelBase, Op): class GpuEye(GpuKernelBase, Op):
__props__ = ('dtype',)
_f16_ok = True
def __init__(self, dtype=None): def __init__(self, dtype=None):
if dtype is None: if dtype is None:
dtype = config.floatX dtype = config.floatX
...@@ -915,20 +906,15 @@ class GpuEye(GpuKernelBase, Op): ...@@ -915,20 +906,15 @@ class GpuEye(GpuKernelBase, Op):
return [grad_undefined(self, i, inp[i]) return [grad_undefined(self, i, inp[i])
for i in xrange(3)] for i in xrange(3)]
def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype
def __hash__(self):
return hash(self.dtype) ^ hash(type(self))
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
code = """ code = """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m; ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) { for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1; a[i*m + i] = %(write_a)s(1);
} }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype), name=name) }""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
name=name, write_a=write_w(self.dtype))
return [Kernel( return [Kernel(
code=code, name="k", code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE], params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
......
def work_dtype(dtype):
if dtype == 'float16':
return 'float32'
else:
return dtype
def load_w(dtype):
if dtype == 'float16':
return '__half2float'
else:
return ''
def write_w(dtype):
if dtype == 'float16':
return '__float2half_rn'
else:
return ''
...@@ -16,6 +16,7 @@ from .type import GpuArrayType ...@@ -16,6 +16,7 @@ from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel, from .kernel_codegen import (nvcc_kernel,
inline_softmax, inline_softmax,
inline_softmax_fixed_shared) inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
...@@ -24,15 +25,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -24,15 +25,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
""" """
nin = 3 nin = 3
nout = 3 nout = 3
__props__ = ()
def __eq__(self, other): _f16_ok = True
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x, b, y_idx): def make_node(self, x, b, y_idx):
# N.B. won't work when we don't cast y_idx to float anymore # N.B. won't work when we don't cast y_idx to float anymore
...@@ -52,6 +46,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -52,6 +46,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
work_x = work_dtype(dtype_x)
work_b = work_dtype(dtype_b)
load_x = load_w(dtype_x)
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
return """ return """
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N, __global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype_x)s* x_data, int xs0, int xs1, const npy_%(dtype_x)s* x_data, int xs0, int xs1,
...@@ -67,12 +67,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -67,12 +67,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0]; const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row; npy_%(dtype_x)s* sm = sm_data + sms0 * row;
npy_%(dtype_x)s sum = 0.0; npy_%(work_x)s sum = 0.0;
int row_max_j = 0; int row_max_j = 0;
npy_%(dtype_x)s row_max = x[0] + b[0]; npy_%(work_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j) for (int j = 1; j < N; ++j)
{ {
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0]; npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
//todo: store to shared memory //todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j; row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max; row_max = (row_ij > row_max) ? row_ij : row_max;
...@@ -80,27 +81,30 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -80,27 +81,30 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp //compute the exp
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0]; npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
npy_%(dtype_x)s sm_ij = exp(row_ij - row_max); %(load_b)s(b[j*bs0]);
npy_%(work_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij; sum += sm_ij;
sm[j * sms1] = sm_ij; sm[j * sms1] = %(write_x)s(sm_ij);
} }
npy_%(dtype_x)s sum_inv = 1.0 / sum; npy_%(work_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
sm[j * sms1] *= sum_inv; npy_%(work_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp);
} }
if ((y_idx >= N) || (y_idx < 0)) if ((y_idx >= N) || (y_idx < 0))
{ {
//TODO: set raise an error bit in a global var? //TODO: set raise an error bit in a global var?
nll_data[row*nlls0] = 0.0; // raise some suspicion at least... nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least...
} }
else else
{ {
nll_data[row*nlls0] = - x[y_idx*xs1] nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1])
- b[y_idx*bs0] - %(load_b)s(b[y_idx*bs0])
+ row_max + row_max
+ log(sum); + log(sum));
} }
am_data[row*ams0] = row_max_j; am_data[row*ams0] = row_max_j;
} }
...@@ -259,8 +263,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -259,8 +263,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
# return () return (6,)
return (5,)
def c_compiler(self): def c_compiler(self):
return NVCC_compiler return NVCC_compiler
...@@ -272,21 +275,13 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho ...@@ -272,21 +275,13 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho
class GpuCrossentropySoftmax1HotWithBiasDx(Op): class GpuCrossentropySoftmax1HotWithBiasDx(Op):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
Gradient wrt x of the CrossentropySoftmax1Hot Op
""" """
nin = 3 nin = 3
nout = 1 nout = 1
"""Gradient wrt x of the CrossentropySoftmax1Hot Op""" __props__ = ()
def __init__(self, **kwargs): _f16_ok = True
Op.__init__(self, **kwargs)
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, dnll, sm, y_idx): def make_node(self, dnll, sm, y_idx):
dnll = as_gpuarray_variable(dnll) dnll = as_gpuarray_variable(dnll)
...@@ -295,8 +290,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -295,8 +290,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()]) return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
# return () return (9,)
return (8,)
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>'] return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
...@@ -421,6 +415,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -421,6 +415,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
dtype_sm = node.inputs[1].dtype dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype dtype_dx = node.outputs[0].dtype
work_dnll = work_dtype(dtype_dnll)
load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
return """ return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s( __global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K, int N, int K,
...@@ -431,7 +429,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -431,7 +429,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
{ {
for (int i = blockIdx.x; i < N; i += gridDim.x) for (int i = blockIdx.x; i < N; i += gridDim.x)
{ {
npy_%(dtype_dnll)s dnll_i = dnll[i * dnll_s0]; npy_%(work_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0]; npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x) for (int j = threadIdx.x; j < K; j += blockDim.x)
...@@ -439,16 +437,15 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -439,16 +437,15 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
if (y_i == j) if (y_i == j)
{ {
dx[i * dx_s0 + j * dx_s1] = dx[i * dx_s0 + j * dx_s1] =
dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0); %(write_dx)s(dnll_i *
(%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
} }
else else
{ {
dx[i * dx_s0 + j * dx_s1] = dx[i * dx_s0 + j * dx_s1] =
dnll_i * sm[i * sm_s0 + j * sm_s1]; %(write_dx)s(dnll_i *
%(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
} }
//dx[i * dx_s0 + j * dx_s1] =
// dnll_i * sm[i * sm_s0 + j * sm_s1];
//dx[i*dx_s0+j*dx_s1] = 0;
} }
} }
} }
...@@ -466,14 +463,7 @@ class GpuSoftmax (Op): ...@@ -466,14 +463,7 @@ class GpuSoftmax (Op):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
def __eq__(self, other): __props__ = ()
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
...@@ -658,15 +648,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -658,15 +648,7 @@ class GpuSoftmaxWithBias (Op):
""" """
nin = 2 nin = 2
nout = 1 nout = 1
__props__ = ()
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x, b): def make_node(self, x, b):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
......
...@@ -10,9 +10,7 @@ except ImportError: ...@@ -10,9 +10,7 @@ except ImportError:
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB, SequenceDB, Optimizer, toolbox)
Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer)
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
...@@ -28,8 +26,7 @@ from .conv import GpuConv ...@@ -28,8 +26,7 @@ from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax)
from .elemwise import (GpuElemwise, _is_scalar, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY) GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
...@@ -134,7 +131,7 @@ class InputToGpuOptimizer(Optimizer): ...@@ -134,7 +131,7 @@ class InputToGpuOptimizer(Optimizer):
new_input = host_from_gpu(gpu_from_host(input)) new_input = host_from_gpu(gpu_from_host(input))
fgraph.replace_validate(input, new_input, fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer") "InputToGpuOptimizer")
except TypeError as e: except TypeError:
# This could fail if the inputs are not TensorTypes # This could fail if the inputs are not TensorTypes
pass pass
...@@ -253,10 +250,11 @@ def local_gpuflatten(node): ...@@ -253,10 +250,11 @@ def local_gpuflatten(node):
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
op = node.op op = node.op
scal_op = op.scalar_op
name = op.name name = op.name
if name: if name:
name = 'Gpu'+name name = 'Gpu'+name
res = GpuElemwise(op.scalar_op, name=name, res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern), inplace_pattern=copy.copy(op.inplace_pattern),
nfunc_spec=op.nfunc_spec) nfunc_spec=op.nfunc_spec)
return res return res
...@@ -391,7 +389,6 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -391,7 +389,6 @@ def local_gpua_advanced_incsubtensor(node):
return None return None
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
coords = node.inputs[2:]
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties device_properties = theano.sandbox.cuda.device_properties
...@@ -640,8 +637,7 @@ def local_gpu_elemwise_careduce(node): ...@@ -640,8 +637,7 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result # operation with some reduction pattern will probably result
# to slow down. # to slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr) isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduceCuda(scalar_op=op.scalar_op, return [GpuCAReduceCuda(scalar_op=op.scalar_op,
......
...@@ -21,6 +21,8 @@ from .comp import NVCC_compiler ...@@ -21,6 +21,8 @@ from .comp import NVCC_compiler
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
_f16_ok = True
def make_node(self, x, *inputs): def make_node(self, x, *inputs):
rval = tensor.Subtensor.make_node(self, x, *inputs) rval = tensor.Subtensor.make_node(self, x, *inputs)
otype = GpuArrayType(dtype=rval.outputs[0].type.dtype, otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
......
...@@ -436,7 +436,7 @@ def test_gpueye(): ...@@ -436,7 +436,7 @@ def test_gpueye():
assert any([isinstance(node.op, GpuEye) assert any([isinstance(node.op, GpuEye)
for node in f.maker.fgraph.toposort()]) for node in f.maker.fgraph.toposort()])
for dtype in ['float32', 'int32']: for dtype in ['float32', 'int32', 'float16']:
yield check, dtype, 3 yield check, dtype, 3
# M != N, k = 0 # M != N, k = 0
yield check, dtype, 3, 5 yield check, dtype, 3, 5
......
...@@ -185,6 +185,7 @@ class GpuArrayType(Type): ...@@ -185,6 +185,7 @@ class GpuArrayType(Type):
# complex64, etc. # complex64, etc.
try: try:
return { return {
'float16': (float, 'npy_float16', 'NPY_FLOAT16'),
'float32': (float, 'npy_float32', 'NPY_FLOAT32'), 'float32': (float, 'npy_float32', 'NPY_FLOAT32'),
'float64': (float, 'npy_float64', 'NPY_FLOAT64'), 'float64': (float, 'npy_float64', 'NPY_FLOAT64'),
'uint8': (int, 'npy_uint8', 'NPY_UINT8'), 'uint8': (int, 'npy_uint8', 'NPY_UINT8'),
...@@ -309,7 +310,11 @@ class GpuArrayConstant(_operators, Constant): ...@@ -309,7 +310,11 @@ class GpuArrayConstant(_operators, Constant):
def __str__(self): def __str__(self):
if self.name is not None: if self.name is not None:
return self.name return self.name
return "GpuArrayConstant{%s}" % numpy.asarray(self.data) try:
np_data = numpy.asarray(self.data)
except gpuarray.GpuArrayException:
np_data = self.data
return "GpuArrayConstant{%s}" % np_data
GpuArrayType.Constant = GpuArrayConstant GpuArrayType.Constant = GpuArrayConstant
......
...@@ -613,18 +613,15 @@ def test_binomial(): ...@@ -613,18 +613,15 @@ def test_binomial():
# test empty size (scalar) # test empty size (scalar)
((), (), [], []), ((), (), [], []),
]: ]:
yield (t_binomial, mean, size, const_size, var_input, input,
steps, rtol)
# print ''
# print 'ON CPU with size=(%s) and mean(%d):' % (str(size), mean) def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234, use_cuda=False)
# Note: we specify `nstreams` to avoid a warning. u = R.binomial(size=size, p=mean)
u = R.binomial(size=size, p=mean,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
f = theano.function(var_input, u, mode=mode) f = theano.function(var_input, u, mode=mode)
# theano.printing.debugprint(f)
out = f(*input) out = f(*input)
# print 'random?[:10]\n', out[0, 0:10]
# print 'random?[-1,-10:]\n', out[-1, -10:]
# Increase the number of steps if sizes implies only a few samples # Increase the number of steps if sizes implies only a few samples
if numpy.prod(const_size) < 10: if numpy.prod(const_size) < 10:
...@@ -636,30 +633,21 @@ def test_binomial(): ...@@ -636,30 +633,21 @@ def test_binomial():
target_avg=mean, mean_rtol=rtol) target_avg=mean, mean_rtol=rtol)
if mode != 'FAST_COMPILE' and cuda_available: if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU with size=(%s) and mean(%d):' % (str(size), mean)
R = MRG_RandomStreams(234, use_cuda=True) R = MRG_RandomStreams(234, use_cuda=True)
u = R.binomial(size=size, p=mean, dtype='float32', u = R.binomial(size=size, p=mean, dtype='float32')
nstreams=rng_mrg.guess_n_streams(size,
warn=False))
# well, it's really that this test w GPU doesn't make sense otw # well, it's really that this test w GPU doesn't make sense otw
assert u.dtype == 'float32' assert u.dtype == 'float32'
f = theano.function(var_input, theano.Out( f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u), theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True), mode=mode_with_gpu) borrow=True), mode=mode_with_gpu)
# theano.printing.debugprint(f)
gpu_out = numpy.asarray(f(*input)) gpu_out = numpy.asarray(f(*input))
# print 'random?[:10]\n', gpu_out[0, 0:10]
# print 'random?[-1,-10:]\n', gpu_out[-1, -10:]
basictest(f, steps_, const_size, prefix='mrg gpu', basictest(f, steps_, const_size, prefix='mrg gpu',
inputs=input, allow_01=True, inputs=input, allow_01=True,
target_avg=mean, mean_rtol=rtol) target_avg=mean, mean_rtol=rtol)
numpy.testing.assert_array_almost_equal(out, gpu_out, numpy.testing.assert_array_almost_equal(out, gpu_out,
decimal=6) decimal=6)
# print ''
# print 'ON CPU w NUMPY with size=(%s) and mean(%d):' % (str(size),
# mean)
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
uu = RR.binomial(size=size, p=mean) uu = RR.binomial(size=size, p=mean)
......
...@@ -50,25 +50,33 @@ class IntegerDivisionError(Exception): ...@@ -50,25 +50,33 @@ class IntegerDivisionError(Exception):
def upcast(dtype, *dtypes): def upcast(dtype, *dtypes):
# Should we try to keep float32 instead of float64? This is used so that # This tries to keep data in floatX or lower precision, unless we
# for instance mixing int64 with float32 yields float32 instead of float64. # explicitely request a higher precision datatype.
# Note that we store this boolean as a one-element list so that it can be
# modified within `make_array`.
keep_float32 = [(config.cast_policy == 'numpy+floatX' and keep_float32 = [(config.cast_policy == 'numpy+floatX' and
config.floatX == 'float32')] config.floatX == 'float32')]
keep_float16 = [(config.cast_policy == 'numpy+floatX' and
config.floatX == 'float16')]
def make_array(dt): def make_array(dt):
if dt == 'float64': if dt == 'float64':
# There is an explicit float64 dtype: we cannot keep float32. # There is an explicit float64 dtype: we cannot keep float32.
keep_float32[0] = False keep_float32[0] = False
keep_float16[0] = False
if dt == 'float32':
keep_float16[0] = False
return numpy.zeros((), dtype=dt) return numpy.zeros((), dtype=dt)
z = make_array(dtype) z = make_array(dtype)
for dt in dtypes: for dt in dtypes:
z = z + make_array(dt=dt) z = z + make_array(dt=dt)
rval = str(z.dtype) rval = str(z.dtype)
if rval == 'float64' and keep_float32[0]: if rval == 'float64':
if keep_float16[0]:
return 'float16'
if keep_float32[0]:
return 'float32' return 'float32'
else: elif rval == 'float32':
if keep_float16[0]:
return 'float16'
return rval return rval
...@@ -232,6 +240,7 @@ class Scalar(Type): ...@@ -232,6 +240,7 @@ class Scalar(Type):
print(dtype, np.zeros(1, dtype=dtype).dtype.num) print(dtype, np.zeros(1, dtype=dtype).dtype.num)
""" """
return { # dtype: (py_type, c_type, cls_name) return { # dtype: (py_type, c_type, cls_name)
'float16': (numpy.float16, 'npy_float16', 'Float16'),
'float32': (numpy.float32, 'npy_float32', 'Float32'), 'float32': (numpy.float32, 'npy_float32', 'Float32'),
'float64': (numpy.float64, 'npy_float64', 'Float64'), 'float64': (numpy.float64, 'npy_float64', 'Float64'),
'complex128': (numpy.complex128, 'theano_complex128', 'complex128': (numpy.complex128, 'theano_complex128',
...@@ -501,6 +510,7 @@ uint8 = get_scalar_type('uint8') ...@@ -501,6 +510,7 @@ uint8 = get_scalar_type('uint8')
uint16 = get_scalar_type('uint16') uint16 = get_scalar_type('uint16')
uint32 = get_scalar_type('uint32') uint32 = get_scalar_type('uint32')
uint64 = get_scalar_type('uint64') uint64 = get_scalar_type('uint64')
float16 = get_scalar_type('float16')
float32 = get_scalar_type('float32') float32 = get_scalar_type('float32')
float64 = get_scalar_type('float64') float64 = get_scalar_type('float64')
complex64 = get_scalar_type('complex64') complex64 = get_scalar_type('complex64')
...@@ -508,7 +518,7 @@ complex128 = get_scalar_type('complex128') ...@@ -508,7 +518,7 @@ complex128 = get_scalar_type('complex128')
int_types = int8, int16, int32, int64 int_types = int8, int16, int32, int64
uint_types = uint8, uint16, uint32, uint64 uint_types = uint8, uint16, uint32, uint64
float_types = float32, float64 float_types = float16, float32, float64
complex_types = complex64, complex128 complex_types = complex64, complex128
discrete_types = int_types + uint_types discrete_types = int_types + uint_types
...@@ -1995,6 +2005,7 @@ convert_to_uint8 = Cast(uint8, name='convert_to_uint8') ...@@ -1995,6 +2005,7 @@ convert_to_uint8 = Cast(uint8, name='convert_to_uint8')
convert_to_uint16 = Cast(uint16, name='convert_to_uint16') convert_to_uint16 = Cast(uint16, name='convert_to_uint16')
convert_to_uint32 = Cast(uint32, name='convert_to_uint32') convert_to_uint32 = Cast(uint32, name='convert_to_uint32')
convert_to_uint64 = Cast(uint64, name='convert_to_uint64') convert_to_uint64 = Cast(uint64, name='convert_to_uint64')
convert_to_float16 = Cast(float16, name='convert_to_float16')
convert_to_float32 = Cast(float32, name='convert_to_float32') convert_to_float32 = Cast(float32, name='convert_to_float32')
convert_to_float64 = Cast(float64, name='convert_to_float64') convert_to_float64 = Cast(float64, name='convert_to_float64')
convert_to_complex64 = Cast(complex64, name='convert_to_complex64') convert_to_complex64 = Cast(complex64, name='convert_to_complex64')
...@@ -2009,6 +2020,7 @@ _cast_mapping = { ...@@ -2009,6 +2020,7 @@ _cast_mapping = {
'uint16': convert_to_uint16, 'uint16': convert_to_uint16,
'uint32': convert_to_uint32, 'uint32': convert_to_uint32,
'uint64': convert_to_uint64, 'uint64': convert_to_uint64,
'float16': convert_to_float16,
'float32': convert_to_float32, 'float32': convert_to_float32,
'float64': convert_to_float64, 'float64': convert_to_float64,
'complex64': convert_to_complex64, 'complex64': convert_to_complex64,
...@@ -3286,14 +3298,20 @@ class Composite(ScalarOp): ...@@ -3286,14 +3298,20 @@ class Composite(ScalarOp):
+ zip(self.fgraph.outputs, + zip(self.fgraph.outputs,
["%%(o%i)s" % i for i in xrange(len(self.fgraph.outputs))])) ["%%(o%i)s" % i for i in xrange(len(self.fgraph.outputs))]))
for orphan in self.fgraph.variables: # fgraph.orphans: for var in self.fgraph.variables:
if orphan.owner is None and orphan not in self.fgraph.inputs: if var.owner is None:
if isinstance(orphan, Constant): if var not in self.fgraph.inputs:
subd[orphan] = orphan.type.c_literal(orphan.data) # This is an orphan
if isinstance(var, Constant):
subd[var] = var.type.c_literal(var.data)
else: else:
raise ValueError( raise ValueError(
"All orphans in the fgraph to Composite must" "All orphans in the fgraph to Composite must"
" be Constant instances.") " be Constant instances.")
elif (any(i.dtype == 'float16' for i in var.owner.inputs) or
any(o.dtype == 'float16' for o in var.owner.outputs)):
# flag for elemwise ops to check.
self.inner_float16 = True
_c_code = "{\n" _c_code = "{\n"
self.nodenames = ["%(nodename)s_" + ('subnode%i' % j) self.nodenames = ["%(nodename)s_" + ('subnode%i' % j)
......
...@@ -2370,6 +2370,9 @@ class CastTester(utt.InferShapeTester): ...@@ -2370,6 +2370,9 @@ class CastTester(utt.InferShapeTester):
for format in sparse.sparse_formats: for format in sparse.sparse_formats:
for i_dtype in sparse.float_dtypes: for i_dtype in sparse.float_dtypes:
for o_dtype in tensor.float_dtypes: for o_dtype in tensor.float_dtypes:
if o_dtype == 'float16':
# Don't test float16 output.
continue
_, data = sparse_random_inputs( _, data = sparse_random_inputs(
format, format,
shape=(4, 7), shape=(4, 7),
......
...@@ -252,10 +252,10 @@ class NumpyAutocaster(object): ...@@ -252,10 +252,10 @@ class NumpyAutocaster(object):
return numpy.asarray(x) return numpy.asarray(x)
elif config.cast_policy == 'numpy+floatX': elif config.cast_policy == 'numpy+floatX':
rval = numpy.asarray(x) rval = numpy.asarray(x)
if ((rval.dtype == 'float64' and # numpy wants float64 if ((not hasattr(x, 'dtype') and
config.floatX == 'float32' and # but we prefer float32 rval.dtype in ('float64', 'float32') and
not hasattr(x, 'dtype'))): # and `x` was not typed rval.dtype != config.floatX)):
rval = theano._asarray(rval, dtype='float32') rval = theano._asarray(rval, dtype=config.floatX)
return rval return rval
# The following is the original code, corresponding to the 'custom' # The following is the original code, corresponding to the 'custom'
...@@ -278,11 +278,14 @@ class NumpyAutocaster(object): ...@@ -278,11 +278,14 @@ class NumpyAutocaster(object):
# recall: float is numpy.float # recall: float is numpy.float
if ((isinstance(x, float) and if ((isinstance(x, float) and
config.floatX in self.dtypes and config.floatX in self.dtypes and
config.floatX == 'float32')): config.floatX != 'float64')):
return theano._asarray(x, dtype=config.floatX)
return theano._asarray(x, dtype='float32') # Don't autocast to float16 unless config.floatX is float16
try_dtypes = [d for d in self.dtypes
if config.floatX == 'float16' or d != 'float16']
for dtype in self.dtypes: for dtype in try_dtypes:
x_ = theano._asarray(x, dtype=dtype) x_ = theano._asarray(x, dtype=dtype)
if numpy.all(x == x_): if numpy.all(x == x_):
break break
...@@ -290,7 +293,7 @@ class NumpyAutocaster(object): ...@@ -290,7 +293,7 @@ class NumpyAutocaster(object):
return x_ return x_
autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64')) autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64'))
autocast_float = NumpyAutocaster(('float32', 'float64')) autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
# autocast_float dtypes might be manipulated in tensor.__init__ # autocast_float dtypes might be manipulated in tensor.__init__
...@@ -313,7 +316,7 @@ class autocast_float_as(object): ...@@ -313,7 +316,7 @@ class autocast_float_as(object):
If `config.cast_policy` is not 'custom', an exception is raised. If `config.cast_policy` is not 'custom', an exception is raised.
For example: For example:
>>> with autocast_float_as('float32') as _dummy: >>> with autocast_float_as('float32'):
... assert (fvector() + 1.1).dtype == 'float32' # temporary downcasting ... assert (fvector() + 1.1).dtype == 'float32' # temporary downcasting
>>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour >>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour
...@@ -1137,6 +1140,10 @@ _convert_to_uint64 = _conversion( ...@@ -1137,6 +1140,10 @@ _convert_to_uint64 = _conversion(
elemwise.Elemwise(scal.convert_to_uint64), 'uint64') elemwise.Elemwise(scal.convert_to_uint64), 'uint64')
"""Cast to unsigned 64-bit integer""" """Cast to unsigned 64-bit integer"""
_convert_to_float16 = _conversion(
elemwise.Elemwise(scal.convert_to_float16), 'float16')
"""Cast to half-precision floating point"""
_convert_to_float32 = _conversion( _convert_to_float32 = _conversion(
elemwise.Elemwise(scal.convert_to_float32), 'float32') elemwise.Elemwise(scal.convert_to_float32), 'float32')
"""Cast to single-precision floating point""" """Cast to single-precision floating point"""
...@@ -1162,6 +1169,7 @@ _cast_mapping = { ...@@ -1162,6 +1169,7 @@ _cast_mapping = {
'uint16': _convert_to_uint16, 'uint16': _convert_to_uint16,
'uint32': _convert_to_uint32, 'uint32': _convert_to_uint32,
'uint64': _convert_to_uint64, 'uint64': _convert_to_uint64,
'float16': _convert_to_float16,
'float32': _convert_to_float32, 'float32': _convert_to_float32,
'float64': _convert_to_float64, 'float64': _convert_to_float64,
'complex64': _convert_to_complex64, 'complex64': _convert_to_complex64,
...@@ -2757,9 +2765,13 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2757,9 +2765,13 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
# sum() will complain if it is not suitable. # sum() will complain if it is not suitable.
sum_dtype = dtype sum_dtype = dtype
else: else:
# Let sum() infer the appropriate dtype.
sum_dtype = None sum_dtype = None
# float16 overflows way too fast for sum
if ((sum_dtype == 'float16' or input.dtype == 'float16') and
acc_dtype != 'float16'):
            sum_dtype = 'float32'
s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims, s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims,
acc_dtype=acc_dtype) acc_dtype=acc_dtype)
shp = shape(input) shp = shape(input)
...@@ -2767,7 +2779,7 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2767,7 +2779,7 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
# Cast shp into a float type # Cast shp into a float type
# TODO Once we have a consistent casting policy, we could simply # TODO Once we have a consistent casting policy, we could simply
# use true_div. # use true_div.
if s.dtype in ('float32', 'complex64'): if s.dtype in ('float16', 'float32', 'complex64'):
shp = cast(shp, 'float32') shp = cast(shp, 'float32')
else: else:
shp = cast(shp, 'float64') shp = cast(shp, 'float64')
...@@ -2785,6 +2797,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2785,6 +2797,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
for i in axis: for i in axis:
s = true_div(s, shp[i]) s = true_div(s, shp[i])
if dtype == 'float16' or (dtype is None and input.dtype == 'float16'):
s = cast(s, 'float16')
return s return s
......
...@@ -417,7 +417,8 @@ class Gemv(Op): ...@@ -417,7 +417,8 @@ class Gemv(Op):
def perform(self, node, inputs, out_storage): def perform(self, node, inputs, out_storage):
y, alpha, A, x, beta = inputs y, alpha, A, x, beta = inputs
if have_fblas and y.shape[0] != 0 and x.shape[0] != 0: if (have_fblas and y.shape[0] != 0 and x.shape[0] != 0 and
y.dtype in _blas_gemv_fns):
gemv = _blas_gemv_fns[y.dtype] gemv = _blas_gemv_fns[y.dtype]
if (A.shape[0] != y.shape[0] or A.shape[1] != x.shape[0]): if (A.shape[0] != y.shape[0] or A.shape[1] != x.shape[0]):
...@@ -1727,7 +1728,7 @@ def local_dot_to_dot22(node): ...@@ -1727,7 +1728,7 @@ def local_dot_to_dot22(node):
x, y, x.type, y.type) x, y, x.type, y.type)
return return
if y.type.dtype.startswith('float') or y.type.dtype.startswith('complex'): if y.type.dtype in ['float32', 'float64', 'complex64', 'complex128']:
if x.ndim == 2 and y.ndim == 2: if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM" # print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
......
...@@ -95,6 +95,7 @@ class DimShuffle(Op): ...@@ -95,6 +95,7 @@ class DimShuffle(Op):
transpose function. transpose function.
Adding, subtracting dimensions can be done with reshape. Adding, subtracting dimensions can be done with reshape.
""" """
_f16_ok = True
check_input = False check_input = False
...@@ -1171,6 +1172,12 @@ class Elemwise(OpenMPOp): ...@@ -1171,6 +1172,12 @@ class Elemwise(OpenMPOp):
return decl, checks, alloc, loop return decl, checks, alloc, loop
def c_code(self, node, nodename, inames, onames, sub): def c_code(self, node, nodename, inames, onames, sub):
if (any(i.dtype == 'float16' for i in node.inputs) or
any(o.dtype == 'float16' for o in node.outputs) or
# This is for Composite
getattr(self.scalar_op, 'inner_float16', False)):
# Disable C code for float16 vars
super(Elemwise, self).c_code(node, nodename, inames, onames, sub)
code = "\n".join(self._c_all(node, nodename, inames, onames, sub)) code = "\n".join(self._c_all(node, nodename, inames, onames, sub))
return code return code
...@@ -1186,7 +1193,7 @@ class Elemwise(OpenMPOp): ...@@ -1186,7 +1193,7 @@ class Elemwise(OpenMPOp):
return support_code return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [11] # the version corresponding to the c code in this Op version = [12] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -1806,6 +1813,7 @@ class CAReduceDtype(CAReduce): ...@@ -1806,6 +1813,7 @@ class CAReduceDtype(CAReduce):
uint8='uint64', uint8='uint64',
uint16='uint64', uint16='uint64',
uint32='uint64', uint32='uint64',
float16='float32',
float32='float64', float32='float64',
complex64='complex128', complex64='complex128',
).get(idtype, idtype) ).get(idtype, idtype)
......
...@@ -1746,7 +1746,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1746,7 +1746,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# Check z is zeros_like(log(sm)) # Check z is zeros_like(log(sm))
if not _is_const(z, 0): if not _is_const(z, 0):
return return
if z.type not in (dmatrix, fmatrix): if z.broadcastable != (False, False):
if not (vector_softmax and z.broadcastable == (True, False)): if not (vector_softmax and z.broadcastable == (True, False)):
return return
# here we know that we are incrementing a matrix of zeros # here we know that we are incrementing a matrix of zeros
...@@ -1758,14 +1758,15 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1758,14 +1758,15 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes: if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes:
return return
# here we know that we are incrementing some part of matrix z by a vector # here we know that we are incrementing some part of
# matrix z by a vector
# unless the user has taken care to mark that the data and labels have the # unless the user has taken care to mark that the data and
# same number of rows, we cannot be sure here that # labels have the same number of rows, we cannot be sure
# len(y) == len(z) # here that len(y) == len(z) However, in the common case
# However, in the common case that these are predictions and labels it is true. # that these are predictions and labels it is true. We
# We leave it to the Op to crash (and the user to complain) if this assumption is # leave it to the Op to crash (and the user to complain)
# ever not true. # if this assumption is ever not true.
out_grad = -incr out_grad = -incr
......
...@@ -1564,6 +1564,7 @@ class Assert(T.Op): ...@@ -1564,6 +1564,7 @@ class Assert(T.Op):
used in the function computing the graph, but it doesn't have to be used in the function computing the graph, but it doesn't have to be
returned. returned.
""" """
__props__ = ('msg',)
view_map = {0: [0]} view_map = {0: [0]}
check_input = False check_input = False
...@@ -1583,24 +1584,18 @@ class Assert(T.Op): ...@@ -1583,24 +1584,18 @@ class Assert(T.Op):
assert numpy.all([c.type.ndim == 0 for c in cond]) assert numpy.all([c.type.ndim == 0 for c in cond])
return gof.Apply(self, [value] + cond, [value.type()]) return gof.Apply(self, [value] + cond, [value.type()])
def __str__(self):
return self.__class__.__name__
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_):
out, = out_ out, = out_
v = inputs[0] v = inputs[0]
out[0] = v out[0] = v
assert numpy.all(inputs[1:]), self.msg assert numpy.all(inputs[1:]), self.msg
def __eq__(self, other):
return type(self) == type(other) and self.msg == other.msg
def __hash__(self):
return hash(type(self)) ^ hash(self.msg)
def grad(self, input, output_gradients): def grad(self, input, output_gradients):
return output_gradients + [DisconnectedType()()] * (len(input) - 1) return output_gradients + [DisconnectedType()()] * (len(input) - 1)
def connection_pattern(self, node):
return [[1]] + [[0]] * (len(node.inputs) - 1)
def c_code(self, node, name, inames, onames, sub): def c_code(self, node, name, inames, onames, sub):
value = inames[0] value = inames[0]
out = onames[0] out = onames[0]
......
...@@ -5820,40 +5820,24 @@ def _test_autocast_custom(): ...@@ -5820,40 +5820,24 @@ def _test_autocast_custom():
orig_autocast = autocast_float.dtypes orig_autocast = autocast_float.dtypes
# Test that autocast_float_as sets the autocast dtype correctly # Test that autocast_float_as sets the autocast dtype correctly
try: # ghetto 2.4 version of with with autocast_float_as('float32'):
ac = autocast_float_as('float32')
ac.__enter__()
assert autocast_float.dtypes == ('float32',) assert autocast_float.dtypes == ('float32',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast assert autocast_float.dtypes == orig_autocast
try: # ghetto 2.4 version of with
ac = autocast_float_as('float64') with autocast_float_as('float64'):
ac.__enter__()
assert autocast_float.dtypes == ('float64',) assert autocast_float.dtypes == ('float64',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast assert autocast_float.dtypes == orig_autocast
# Test that we can set it back to something, and nest it # Test that we can set it back to something, and nest it
try: # ghetto 2.4 version of with with autocast_float_as('float32'):
ac = autocast_float_as('float32')
ac.__enter__()
assert autocast_float.dtypes == ('float32',) assert autocast_float.dtypes == ('float32',)
try: # ghetto 2.4 version of with with autocast_float_as('float64'):
ac2 = autocast_float_as('float64')
ac2.__enter__()
assert autocast_float.dtypes == ('float64',) assert autocast_float.dtypes == ('float64',)
finally:
ac2.__exit__()
assert autocast_float.dtypes == ('float32',) assert autocast_float.dtypes == ('float32',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast assert autocast_float.dtypes == orig_autocast
# Test that the autocasting dtype is used correctly in expression-building # Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with with autocast_float_as('float32'):
ac = autocast_float_as('float32')
ac.__enter__()
assert (dvector() + 1.1).dtype == 'float64' assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == 'float32' assert (fvector() + 1.1).dtype == 'float32'
assert (fvector() + theano._asarray(1.1, dtype='float64')).dtype == \ assert (fvector() + theano._asarray(1.1, dtype='float64')).dtype == \
...@@ -5863,13 +5847,9 @@ def _test_autocast_custom(): ...@@ -5863,13 +5847,9 @@ def _test_autocast_custom():
assert (dvector() + 1).dtype == 'float64' assert (dvector() + 1).dtype == 'float64'
assert (fvector() + 1).dtype == 'float32' assert (fvector() + 1).dtype == 'float32'
finally:
ac.__exit__()
# Test that the autocasting dtype is used correctly in expression-building # Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with with autocast_float_as('float64'):
ac = autocast_float_as('float64')
ac.__enter__()
assert (dvector() + 1.1).dtype == 'float64' assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == 'float64' assert (fvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.0).dtype == 'float64' assert (fvector() + 1.0).dtype == 'float64'
...@@ -5880,13 +5860,9 @@ def _test_autocast_custom(): ...@@ -5880,13 +5860,9 @@ def _test_autocast_custom():
assert (dvector() + 1).dtype == 'float64' assert (dvector() + 1).dtype == 'float64'
assert (fvector() + 1).dtype == 'float32' assert (fvector() + 1).dtype == 'float32'
finally:
ac.__exit__()
# Test that the autocasting dtype is used correctly in expression-building # Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with with autocast_float_as('float32', 'float64'):
ac = autocast_float_as('float32', 'float64')
ac.__enter__()
assert (dvector() + 1.1).dtype == 'float64' assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == theano.config.floatX assert (fvector() + 1.1).dtype == theano.config.floatX
assert (fvector() + 1.0).dtype == 'float32' assert (fvector() + 1.0).dtype == 'float32'
...@@ -5903,14 +5879,8 @@ def _test_autocast_custom(): ...@@ -5903,14 +5879,8 @@ def _test_autocast_custom():
assert (ivector() + numpy.int8(1)).dtype == 'int32' assert (ivector() + numpy.int8(1)).dtype == 'int32'
assert (wvector() + numpy.int8(1)).dtype == 'int16' assert (wvector() + numpy.int8(1)).dtype == 'int16'
assert (bvector() + numpy.int8(1)).dtype == 'int8' assert (bvector() + numpy.int8(1)).dtype == 'int8'
try: # ghetto 2.4 version of with with autocast_float_as('float64'):
ac2 = autocast_float_as('float64')
ac2.__enter__()
assert (fvector() + 1.0).dtype == 'float64' assert (fvector() + 1.0).dtype == 'float64'
finally:
ac2.__exit__()
finally:
ac.__exit__()
def _test_autocast_numpy(): def _test_autocast_numpy():
...@@ -6036,17 +6006,8 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6036,17 +6006,8 @@ class test_arithmetic_cast(unittest.TestCase):
config.int_division == 'raise') config.int_division == 'raise')
# This is the expected behavior. # This is the expected behavior.
continue continue
# For numpy we have a problem: numpy_dtype = op(numpy_args[0](a_type),
# http://projects.scipy.org/numpy/ticket/1827 numpy_args[1](b_type)).dtype
# As a result we only consider the highest data
# type that numpy may return.
numpy_dtypes = [
op(numpy_args[0](a_type),
numpy_args[1](b_type)).dtype,
op(numpy_args[1](b_type),
numpy_args[0](a_type)).dtype]
numpy_dtype = theano.scalar.upcast(
*map(str, numpy_dtypes))
if numpy_dtype == theano_dtype: if numpy_dtype == theano_dtype:
# Same data type found, all is good! # Same data type found, all is good!
continue continue
...@@ -6078,9 +6039,7 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6078,9 +6039,7 @@ class test_arithmetic_cast(unittest.TestCase):
# Theano upcasted the result array. # Theano upcasted the result array.
theano_dtype == up_type and theano_dtype == up_type and
# But Numpy kept its original type. # But Numpy kept its original type.
# (not an equality because of numpy bug array_type == numpy_dtype):
# mentioned above).
array_type in numpy_dtypes):
# Then we accept this difference in # Then we accept this difference in
# behavior. # behavior.
continue continue
...@@ -6092,17 +6051,20 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6092,17 +6051,20 @@ class test_arithmetic_cast(unittest.TestCase):
numpy.__version__.split('.')[:2]] numpy.__version__.split('.')[:2]]
if (cfg == 'numpy+floatX' and if (cfg == 'numpy+floatX' and
a_type == 'complex128' and a_type == 'complex128' and
b_type == 'float32' and (b_type == 'float32' or
b_type == 'float16') and
combo == ('scalar', 'array') and combo == ('scalar', 'array') and
bool(numpy_version >= [1, 6]) and bool(numpy_version >= [1, 6]) and
theano_dtype == 'complex128' and theano_dtype == 'complex128' and
numpy_dtypes == ['complex64', numpy_dtype == 'complex64'):
'complex64']): # In numpy 1.6.x adding a
# In numpy 1.6.x adding a complex128 with # complex128 with a float32 or
# a float32 may result in a complex64. This # float16 may result in a
# may be a bug (investigation is currently # complex64. This may be a bug
# in progress), so in the meantime we just # (investigation is currently in
# mark this test as a known failure. # progress), so in the meantime we
# just mark this test as a known
# failure.
raise KnownFailureTest('Known issue with ' raise KnownFailureTest('Known issue with '
'numpy >= 1.6.x see #761') 'numpy >= 1.6.x see #761')
......
...@@ -1024,6 +1024,7 @@ class T_prod_without_zeros_dtype(unittest.TestCase): ...@@ -1024,6 +1024,7 @@ class T_prod_without_zeros_dtype(unittest.TestCase):
uint8='uint64', uint8='uint64',
uint16='uint64', uint16='uint64',
uint32='uint64', uint32='uint64',
float16='float32',
float32='float64', float32='float64',
complex64='complex128' complex64='complex128'
).get(dtype, dtype) ).get(dtype, dtype)
......
...@@ -234,6 +234,7 @@ class TensorType(Type): ...@@ -234,6 +234,7 @@ class TensorType(Type):
# complex64, etc. # complex64, etc.
try: try:
return { return {
'float16': (float, 'npy_float16', 'NPY_FLOAT16'),
'float32': (float, 'npy_float32', 'NPY_FLOAT32'), 'float32': (float, 'npy_float32', 'NPY_FLOAT32'),
'float64': (float, 'npy_float64', 'NPY_FLOAT64'), 'float64': (float, 'npy_float64', 'NPY_FLOAT64'),
'uint8': (int, 'npy_uint8', 'NPY_UINT8'), 'uint8': (int, 'npy_uint8', 'NPY_UINT8'),
......
...@@ -6,6 +6,7 @@ from nose.plugins.skip import SkipTest ...@@ -6,6 +6,7 @@ from nose.plugins.skip import SkipTest
import os import os
from fnmatch import fnmatch from fnmatch import fnmatch
import theano import theano
from theano.compat import PY3
try: try:
import flake8.engine import flake8.engine
import flake8.main import flake8.main
...@@ -227,7 +228,6 @@ whitelist_flake8 = [ ...@@ -227,7 +228,6 @@ whitelist_flake8 = [
"sandbox/gpuarray/elemwise.py", "sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py", "sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/opt.py",
"sandbox/gpuarray/blas.py", "sandbox/gpuarray/blas.py",
"sandbox/gpuarray/kernel_codegen.py", "sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py", "sandbox/gpuarray/conv.py",
...@@ -347,6 +347,8 @@ def test_format_flake8(): ...@@ -347,6 +347,8 @@ def test_format_flake8():
""" """
if not flake8_available: if not flake8_available:
raise SkipTest("flake8 is not installed") raise SkipTest("flake8 is not installed")
if PY3:
raise SkipTest("not testing in python3 since 2to3 ran")
total_errors = 0 total_errors = 0
for path in list_files(): for path in list_files():
rel_path = os.path.relpath(path, theano.__path__[0]) rel_path = os.path.relpath(path, theano.__path__[0])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论