提交 33eafac3 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5806 from nouiz/gammaln

Make Gammaln work on the new GPU back-end
......@@ -9,6 +9,6 @@ else
fi
source activate pyenv
if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.0 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx mkl-service libgfortran=1; fi
if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx mkl-service; fi
if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.0 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx=1.5.1 mkl-service libgfortran=1; fi
if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx=1.5.1 mkl-service; fi
source deactivate
......@@ -1796,6 +1796,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
if isinstance(mode, (list, tuple)): # "mode comparison" semantics
raise Exception("We do not support the passing of multiple modes")
fn = None
try:
Maker = getattr(mode, 'function_maker', FunctionMaker)
fn = Maker(inputs,
......@@ -1808,7 +1809,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
defaults)
finally:
t2 = time.time()
if profile:
if fn and profile:
profile.compile_time += t2 - t1
# TODO: append
profile.nb_nodes = len(fn.maker.fgraph.apply_nodes)
......
......@@ -89,6 +89,7 @@ def _atexit_print_fn():
# merge dictionary
for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "variable_shape", "variable_strides",
"variable_offset",
"linker_make_thunk_time"]:
cum_attr = getattr(cum, attr)
for key, val in iteritems(getattr(ps, attr)):
......@@ -229,6 +230,10 @@ class ProfileStats(object):
# Variable -> strides
#
variable_offset = {}
# Variable -> offset
#
optimizer_time = 0.0
# time spent optimizing graph (FunctionMaker.__init__)
......@@ -295,6 +300,7 @@ class ProfileStats(object):
self.apply_cimpl = {}
self.variable_shape = {}
self.variable_strides = {}
self.variable_offset = {}
if flag_time_thunks is None:
self.flag_time_thunks = config.profiling.time_thunks
else:
......@@ -697,15 +703,21 @@ class ProfileStats(object):
for idx, var in enumerate(a.inputs):
sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides')
off = self.variable_offset.get(var, '')
if off != '':
off = ", offset=%s" % off
dtype = getattr(var, 'dtype', 'no dtype')
print(" input %d: dtype=%s, shape=%s, strides=%s " % (
idx, dtype, sh, st), file=file)
print(" input %d: dtype=%s, shape=%s, strides=%s%s" % (
idx, dtype, sh, st, off), file=file)
for idx, var in enumerate(a.outputs):
sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides')
off = self.variable_offset.get(var, '')
if off != '':
off = ", offset=%s" % off
dtype = getattr(var, 'dtype', 'no dtype')
print(" output %d: dtype=%s, shape=%s, strides=%s " % (
idx, dtype, sh, st), file=file)
print(" output %d: dtype=%s, shape=%s, strides=%s%s" % (
idx, dtype, sh, st, off), file=file)
# Same as before, this I've sacrificied some information making
# the output more readable
print(' ... (remaining %i Apply instances account for '
......
......@@ -207,6 +207,7 @@ class VM(object):
if hasattr(self, 'variable_shape'):
profile.variable_shape = self.variable_shape.copy()
profile.variable_strides = self.variable_strides.copy()
profile.variable_offset = self.variable_offset.copy()
if hasattr(self, 'node_executed_order'):
profile.node_executed_order = self.node_executed_order[:]
......@@ -342,6 +343,7 @@ class Stack(VM):
self.storage_map = storage_map
self.variable_shape = {} # Variable -> shape
self.variable_strides = {} # Variable -> strides
self.variable_offset = {} # Variable -> offset
self.compute_map = compute_map
self.node_idx = node_idx = {}
self.callback = callback
......@@ -436,15 +438,17 @@ class Stack(VM):
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(data[0])
else:
sh = 'input no shape'
sh = 'no shape'
self.variable_shape[var] = sh
st = getattr(data[0], 'strides', 'input no strides')
st = getattr(data[0], 'strides', 'no strides')
if getattr(data[0], 'flags', False) and data[0].flags.c_contiguous:
st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()):
st = "c"
self.variable_strides[var] = st
off = getattr(data[0], 'offset', '')
self.variable_offset[var] = off
while apply_stack:
# Make sure something happened last time round. This is
......@@ -495,17 +499,19 @@ class Stack(VM):
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(o[0])
else:
sh = 'input no shape'
sh = 'no shape'
self.variable_shape[var] = sh
st = getattr(o[0], 'strides',
'input no strides')
'no strides')
if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous):
st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()):
elif (hasattr(o[0], 'is_c_contiguous') and
o[0].is_c_contiguous()):
st = "c"
self.variable_strides[var] = st
off = getattr(o[0], 'offset', '')
self.variable_offset[var] = off
except Exception:
link.raise_with_op(
current_apply,
......@@ -604,16 +610,18 @@ class Stack(VM):
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(o[0])
else:
sh = 'input no shape'
sh = 'no shape'
self.variable_shape[var] = sh
st = getattr(o[0], 'strides', 'input no strides')
st = getattr(o[0], 'strides', 'no strides')
if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous):
st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()):
elif (hasattr(o[0], 'is_c_contiguous') and
o[0].is_c_contiguous()):
st = "c"
self.variable_strides[var] = st
off = getattr(o[0], 'offset', '')
self.variable_offset[var] = off
input_index = []
......
......@@ -97,7 +97,9 @@ def init_dev(dev, name=None):
# Initialise the blas kernels. We do this after the
# preallocation to not fragment the heap accidentally.
tmp = pygpu.empty((2, 2), dtype='float32', context=context)
pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True)
if dev.startswith('cuda'):
# In OpenCL, BLAS isn't always available
pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True)
del tmp
else:
context = init_dev.devmap[dev]
......
......@@ -423,12 +423,11 @@ class GpuGemmBatch(BlasOp):
def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
beta=inp[4], fail=sub['fail'], name=name)
beta=inp[4], inplace=int(self.inplace),
fail=sub['fail'], name=name)
code = """
int err;
"""
if self.inplace:
code += """
if (%(inplace)s){
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) {
......@@ -439,15 +438,12 @@ class GpuGemmBatch(BlasOp):
%(out)s = %(C)s;
Py_INCREF(%(out)s);
}
""" % vars
else:
code += """
} else {
%(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) {
%(fail)s
}
""" % vars
code += """
}
err = GpuArray_rgemmBatch_3d(
cb_no_trans, cb_no_trans,
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
......@@ -467,7 +463,7 @@ class GpuGemmBatch(BlasOp):
return code
def c_code_cache_version(self):
return (1,)
return (2,)
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
......
......@@ -2512,6 +2512,7 @@ class GpuErfinv(Erfinv):
# For consistency of CPU and GPU ops, we wrap the CUDA erfinv in the following conditions
# to ensure that GPU op returns the same values as CPU op.
return "%(z)s = (%(x)s <= -1) ? erfinv(-1.0): ((%(x)s >= 1) ? erfinv(1.0): erfinv(%(x)s));" % locals()
gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
class GpuErfcinv(Erfcinv):
......@@ -2533,8 +2534,6 @@ class GpuErfcinv(Erfcinv):
# For consistency of CPU and GPU ops, we wrap the CUDA erfcinv in the following conditions
# to ensure that GPU op returns the same values as CPU op.
return "%(z)s = (%(x)s <= 0) ? erfcinv(0.0): ((%(x)s >= 2) ? erfcinv(2.0): erfcinv(%(x)s));" % locals()
gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
gpu_erfcinv = GpuErfcinv(upgrade_to_float_no_complex, name='gpu_erfcinv')
......
......@@ -711,18 +711,15 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
have_opencl = True
elif kind.startswith(b'cuda'):
have_cuda = True
opname = False
if isinstance(scal_op, Erfinv):
opname = 'erfinv'
if have_cuda:
scal_op = gpu_erfinv
elif isinstance(scal_op, Erfcinv):
opname = 'erfcinv'
if have_cuda:
scal_op = gpu_erfcinv
if opname:
convert = {Erfinv: gpu_erfinv,
Erfcinv: gpu_erfcinv}
if scal_op.__class__ in convert:
scal_op = convert[scal_op.__class__]
if have_opencl:
_logger.warning('Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' % opname)
_logger.warning(
'Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' %
scal_op)
if not have_cuda:
return None
res = GpuElemwise(scal_op, name=name,
......
......@@ -269,10 +269,19 @@ class GammaLn(UnaryScalarOp):
def c_code(self, node, name, inp, out, sub):
x, = inp
z, = out
if node.inputs[0].type in float_types:
return """%(z)s =
lgamma(%(x)s);""" % locals()
raise NotImplementedError('only floating point is implemented')
# no c code for complex
# [u]int* will be casted to float64 before computation
if node.inputs[0].type in complex_types:
raise NotImplementedError(
'gammaln complex c code is not implemented')
# For some reason, on the GPU, uint64 inputs don't get casted
# automatically to float64. This make the compilation crash
dtype = ""
if node.outputs[0].dtype == 'float64':
dtype = "(double)"
elif node.outputs[0].dtype == 'float32':
dtype = "(float)"
return """%(z)s = lgamma(%(dtype)s%(x)s);""" % locals()
gammaln = GammaLn(upgrade_to_float, name='gammaln')
......
......@@ -1807,7 +1807,8 @@ _good_broadcast_unary_gammaln = dict(
empty=(np.asarray([], dtype=config.floatX),),
int=(randint_ranged(1, 10, (2, 3)),),
uint8=(randint_ranged(1, 6, (2, 3)).astype('uint8'),),
uint16=(randint_ranged(1, 10, (2, 3)).astype('uint16'),))
uint16=(randint_ranged(1, 10, (2, 3)).astype('uint16'),),
uint64=(randint_ranged(1, 10, (2, 3)).astype('uint64'),))
_grad_broadcast_unary_gammaln = dict(
# smaller range as our grad method does not estimate it well enough.
normal=(rand_ranged(1e-1, 8, (2, 3)),),)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论