提交 33eafac3 作者: Frédéric Bastien 提交者: GitHub

Merge pull request #5806 from nouiz/gammaln

Make Gammaln work on the new GPU back-end
...@@ -9,6 +9,6 @@ else ...@@ -9,6 +9,6 @@ else
fi fi
source activate pyenv source activate pyenv
if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.0 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx mkl-service libgfortran=1; fi if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.0 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx=1.5.1 mkl-service libgfortran=1; fi
if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx mkl-service; fi if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx=1.5.1 mkl-service; fi
source deactivate source deactivate
...@@ -1796,6 +1796,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False, ...@@ -1796,6 +1796,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
if isinstance(mode, (list, tuple)): # "mode comparison" semantics if isinstance(mode, (list, tuple)): # "mode comparison" semantics
raise Exception("We do not support the passing of multiple modes") raise Exception("We do not support the passing of multiple modes")
fn = None
try: try:
Maker = getattr(mode, 'function_maker', FunctionMaker) Maker = getattr(mode, 'function_maker', FunctionMaker)
fn = Maker(inputs, fn = Maker(inputs,
...@@ -1808,7 +1809,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False, ...@@ -1808,7 +1809,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
defaults) defaults)
finally: finally:
t2 = time.time() t2 = time.time()
if profile: if fn and profile:
profile.compile_time += t2 - t1 profile.compile_time += t2 - t1
# TODO: append # TODO: append
profile.nb_nodes = len(fn.maker.fgraph.apply_nodes) profile.nb_nodes = len(fn.maker.fgraph.apply_nodes)
......
...@@ -89,6 +89,7 @@ def _atexit_print_fn(): ...@@ -89,6 +89,7 @@ def _atexit_print_fn():
# merge dictonary # merge dictonary
for attr in ["apply_time", "apply_callcount", for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "variable_shape", "variable_strides", "apply_cimpl", "variable_shape", "variable_strides",
"variable_offset",
"linker_make_thunk_time"]: "linker_make_thunk_time"]:
cum_attr = getattr(cum, attr) cum_attr = getattr(cum, attr)
for key, val in iteritems(getattr(ps, attr)): for key, val in iteritems(getattr(ps, attr)):
...@@ -229,6 +230,10 @@ class ProfileStats(object): ...@@ -229,6 +230,10 @@ class ProfileStats(object):
# Variable -> strides # Variable -> strides
# #
variable_offset = {}
# Variable -> offset
#
optimizer_time = 0.0 optimizer_time = 0.0
# time spent optimizing graph (FunctionMaker.__init__) # time spent optimizing graph (FunctionMaker.__init__)
...@@ -295,6 +300,7 @@ class ProfileStats(object): ...@@ -295,6 +300,7 @@ class ProfileStats(object):
self.apply_cimpl = {} self.apply_cimpl = {}
self.variable_shape = {} self.variable_shape = {}
self.variable_strides = {} self.variable_strides = {}
self.variable_offset = {}
if flag_time_thunks is None: if flag_time_thunks is None:
self.flag_time_thunks = config.profiling.time_thunks self.flag_time_thunks = config.profiling.time_thunks
else: else:
...@@ -697,15 +703,21 @@ class ProfileStats(object): ...@@ -697,15 +703,21 @@ class ProfileStats(object):
for idx, var in enumerate(a.inputs): for idx, var in enumerate(a.inputs):
sh = self.variable_shape.get(var, 'no shape') sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides') st = self.variable_strides.get(var, 'no strides')
off = self.variable_offset.get(var, '')
if off != '':
off = ", offset=%s" % off
dtype = getattr(var, 'dtype', 'no dtype') dtype = getattr(var, 'dtype', 'no dtype')
print(" input %d: dtype=%s, shape=%s, strides=%s " % ( print(" input %d: dtype=%s, shape=%s, strides=%s%s" % (
idx, dtype, sh, st), file=file) idx, dtype, sh, st, off), file=file)
for idx, var in enumerate(a.outputs): for idx, var in enumerate(a.outputs):
sh = self.variable_shape.get(var, 'no shape') sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides') st = self.variable_strides.get(var, 'no strides')
off = self.variable_offset.get(var, '')
if off != '':
off = ", offset=%s" % off
dtype = getattr(var, 'dtype', 'no dtype') dtype = getattr(var, 'dtype', 'no dtype')
print(" output %d: dtype=%s, shape=%s, strides=%s " % ( print(" output %d: dtype=%s, shape=%s, strides=%s%s" % (
idx, dtype, sh, st), file=file) idx, dtype, sh, st, off), file=file)
# Same as before, this I've sacrificied some information making # Same as before, this I've sacrificied some information making
# the output more readable # the output more readable
print(' ... (remaining %i Apply instances account for ' print(' ... (remaining %i Apply instances account for '
......
...@@ -207,6 +207,7 @@ class VM(object): ...@@ -207,6 +207,7 @@ class VM(object):
if hasattr(self, 'variable_shape'): if hasattr(self, 'variable_shape'):
profile.variable_shape = self.variable_shape.copy() profile.variable_shape = self.variable_shape.copy()
profile.variable_strides = self.variable_strides.copy() profile.variable_strides = self.variable_strides.copy()
profile.variable_offset = self.variable_offset.copy()
if hasattr(self, 'node_executed_order'): if hasattr(self, 'node_executed_order'):
profile.node_executed_order = self.node_executed_order[:] profile.node_executed_order = self.node_executed_order[:]
...@@ -342,6 +343,7 @@ class Stack(VM): ...@@ -342,6 +343,7 @@ class Stack(VM):
self.storage_map = storage_map self.storage_map = storage_map
self.variable_shape = {} # Variable -> shape self.variable_shape = {} # Variable -> shape
self.variable_strides = {} # Variable -> strides self.variable_strides = {} # Variable -> strides
self.variable_offset = {} # Variable -> offset
self.compute_map = compute_map self.compute_map = compute_map
self.node_idx = node_idx = {} self.node_idx = node_idx = {}
self.callback = callback self.callback = callback
...@@ -436,15 +438,17 @@ class Stack(VM): ...@@ -436,15 +438,17 @@ class Stack(VM):
if hasattr(var.type, 'get_shape_info'): if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(data[0]) sh = var.type.get_shape_info(data[0])
else: else:
sh = 'input no shape' sh = 'no shape'
self.variable_shape[var] = sh self.variable_shape[var] = sh
st = getattr(data[0], 'strides', 'input no strides') st = getattr(data[0], 'strides', 'no strides')
if getattr(data[0], 'flags', False) and data[0].flags.c_contiguous: if getattr(data[0], 'flags', False) and data[0].flags.c_contiguous:
st = 'c' st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()): data[0].is_c_contiguous()):
st = "c" st = "c"
self.variable_strides[var] = st self.variable_strides[var] = st
off = getattr(data[0], 'offset', '')
self.variable_offset[var] = off
while apply_stack: while apply_stack:
# Make sure something happened last time round. This is # Make sure something happened last time round. This is
...@@ -495,17 +499,19 @@ class Stack(VM): ...@@ -495,17 +499,19 @@ class Stack(VM):
if hasattr(var.type, 'get_shape_info'): if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(o[0]) sh = var.type.get_shape_info(o[0])
else: else:
sh = 'input no shape' sh = 'no shape'
self.variable_shape[var] = sh self.variable_shape[var] = sh
st = getattr(o[0], 'strides', st = getattr(o[0], 'strides',
'input no strides') 'no strides')
if (getattr(o[0], 'flags', False) and if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous): o[0].flags.c_contiguous):
st = 'c' st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and elif (hasattr(o[0], 'is_c_contiguous') and
data[0].is_c_contiguous()): o[0].is_c_contiguous()):
st = "c" st = "c"
self.variable_strides[var] = st self.variable_strides[var] = st
off = getattr(o[0], 'offset', '')
self.variable_offset[var] = off
except Exception: except Exception:
link.raise_with_op( link.raise_with_op(
current_apply, current_apply,
...@@ -604,16 +610,18 @@ class Stack(VM): ...@@ -604,16 +610,18 @@ class Stack(VM):
if hasattr(var.type, 'get_shape_info'): if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(o[0]) sh = var.type.get_shape_info(o[0])
else: else:
sh = 'input no shape' sh = 'no shape'
self.variable_shape[var] = sh self.variable_shape[var] = sh
st = getattr(o[0], 'strides', 'input no strides') st = getattr(o[0], 'strides', 'no strides')
if (getattr(o[0], 'flags', False) and if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous): o[0].flags.c_contiguous):
st = 'c' st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and elif (hasattr(o[0], 'is_c_contiguous') and
data[0].is_c_contiguous()): o[0].is_c_contiguous()):
st = "c" st = "c"
self.variable_strides[var] = st self.variable_strides[var] = st
off = getattr(o[0], 'offset', '')
self.variable_offset[var] = off
input_index = [] input_index = []
......
...@@ -97,7 +97,9 @@ def init_dev(dev, name=None): ...@@ -97,7 +97,9 @@ def init_dev(dev, name=None):
# Initialise the blas kernels. We do this after the # Initialise the blas kernels. We do this after the
# preallocation to not fragment the heap accidentally. # preallocation to not fragment the heap accidentally.
tmp = pygpu.empty((2, 2), dtype='float32', context=context) tmp = pygpu.empty((2, 2), dtype='float32', context=context)
pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True) if dev.startswith('cuda'):
# In OpenCL, BLAS isn't always available
pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True)
del tmp del tmp
else: else:
context = init_dev.devmap[dev] context = init_dev.devmap[dev]
......
...@@ -423,12 +423,11 @@ class GpuGemmBatch(BlasOp): ...@@ -423,12 +423,11 @@ class GpuGemmBatch(BlasOp):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3], vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
beta=inp[4], fail=sub['fail'], name=name) beta=inp[4], inplace=int(self.inplace),
fail=sub['fail'], name=name)
code = """ code = """
int err; int err;
""" if (%(inplace)s){
if self.inplace:
code += """
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) { if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = theano_try_copy(%(out)s, %(C)s); %(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
...@@ -439,15 +438,12 @@ class GpuGemmBatch(BlasOp): ...@@ -439,15 +438,12 @@ class GpuGemmBatch(BlasOp):
%(out)s = %(C)s; %(out)s = %(C)s;
Py_INCREF(%(out)s); Py_INCREF(%(out)s);
} }
""" % vars } else {
else:
code += """
%(out)s = theano_try_copy(%(out)s, %(C)s); %(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
""" % vars }
code += """
err = GpuArray_rgemmBatch_3d( err = GpuArray_rgemmBatch_3d(
cb_no_trans, cb_no_trans, cb_no_trans, cb_no_trans,
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
...@@ -467,7 +463,7 @@ class GpuGemmBatch(BlasOp): ...@@ -467,7 +463,7 @@ class GpuGemmBatch(BlasOp):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False) gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True) gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
......
...@@ -2512,6 +2512,7 @@ class GpuErfinv(Erfinv): ...@@ -2512,6 +2512,7 @@ class GpuErfinv(Erfinv):
# For consistency of CPU and GPU ops, we wrap the CUDA erfinv in the following conditions # For consistency of CPU and GPU ops, we wrap the CUDA erfinv in the following conditions
# to ensure that GPU op returns the same values as CPU op. # to ensure that GPU op returns the same values as CPU op.
return "%(z)s = (%(x)s <= -1) ? erfinv(-1.0): ((%(x)s >= 1) ? erfinv(1.0): erfinv(%(x)s));" % locals() return "%(z)s = (%(x)s <= -1) ? erfinv(-1.0): ((%(x)s >= 1) ? erfinv(1.0): erfinv(%(x)s));" % locals()
gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
class GpuErfcinv(Erfcinv): class GpuErfcinv(Erfcinv):
...@@ -2533,8 +2534,6 @@ class GpuErfcinv(Erfcinv): ...@@ -2533,8 +2534,6 @@ class GpuErfcinv(Erfcinv):
# For consistency of CPU and GPU ops, we wrap the CUDA erfcinv in the following conditions # For consistency of CPU and GPU ops, we wrap the CUDA erfcinv in the following conditions
# to ensure that GPU op returns the same values as CPU op. # to ensure that GPU op returns the same values as CPU op.
return "%(z)s = (%(x)s <= 0) ? erfcinv(0.0): ((%(x)s >= 2) ? erfcinv(2.0): erfcinv(%(x)s));" % locals() return "%(z)s = (%(x)s <= 0) ? erfcinv(0.0): ((%(x)s >= 2) ? erfcinv(2.0): erfcinv(%(x)s));" % locals()
gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
gpu_erfcinv = GpuErfcinv(upgrade_to_float_no_complex, name='gpu_erfcinv') gpu_erfcinv = GpuErfcinv(upgrade_to_float_no_complex, name='gpu_erfcinv')
......
...@@ -711,18 +711,15 @@ def local_gpua_elemwise(op, context_name, inputs, outputs): ...@@ -711,18 +711,15 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
have_opencl = True have_opencl = True
elif kind.startswith(b'cuda'): elif kind.startswith(b'cuda'):
have_cuda = True have_cuda = True
opname = False convert = {Erfinv: gpu_erfinv,
if isinstance(scal_op, Erfinv): Erfcinv: gpu_erfcinv}
opname = 'erfinv'
if have_cuda: if scal_op.__class__ in convert:
scal_op = gpu_erfinv scal_op = convert[scal_op.__class__]
elif isinstance(scal_op, Erfcinv):
opname = 'erfcinv'
if have_cuda:
scal_op = gpu_erfcinv
if opname:
if have_opencl: if have_opencl:
_logger.warning('Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' % opname) _logger.warning(
'Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' %
scal_op)
if not have_cuda: if not have_cuda:
return None return None
res = GpuElemwise(scal_op, name=name, res = GpuElemwise(scal_op, name=name,
......
...@@ -269,10 +269,19 @@ class GammaLn(UnaryScalarOp): ...@@ -269,10 +269,19 @@ class GammaLn(UnaryScalarOp):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
x, = inp x, = inp
z, = out z, = out
if node.inputs[0].type in float_types: # no c code for complex
return """%(z)s = # [u]int* will be casted to float64 before computation
lgamma(%(x)s);""" % locals() if node.inputs[0].type in complex_types:
raise NotImplementedError('only floating point is implemented') raise NotImplementedError(
'gammaln complex c code is not implemented')
# For some reason, on the GPU, uint64 inputs don't get casted
# automatically to float64. This make the compilation crash
dtype = ""
if node.outputs[0].dtype == 'float64':
dtype = "(double)"
elif node.outputs[0].dtype == 'float32':
dtype = "(float)"
return """%(z)s = lgamma(%(dtype)s%(x)s);""" % locals()
gammaln = GammaLn(upgrade_to_float, name='gammaln') gammaln = GammaLn(upgrade_to_float, name='gammaln')
......
...@@ -1807,7 +1807,8 @@ _good_broadcast_unary_gammaln = dict( ...@@ -1807,7 +1807,8 @@ _good_broadcast_unary_gammaln = dict(
empty=(np.asarray([], dtype=config.floatX),), empty=(np.asarray([], dtype=config.floatX),),
int=(randint_ranged(1, 10, (2, 3)),), int=(randint_ranged(1, 10, (2, 3)),),
uint8=(randint_ranged(1, 6, (2, 3)).astype('uint8'),), uint8=(randint_ranged(1, 6, (2, 3)).astype('uint8'),),
uint16=(randint_ranged(1, 10, (2, 3)).astype('uint16'),)) uint16=(randint_ranged(1, 10, (2, 3)).astype('uint16'),),
uint64=(randint_ranged(1, 10, (2, 3)).astype('uint64'),))
_grad_broadcast_unary_gammaln = dict( _grad_broadcast_unary_gammaln = dict(
# smaller range as our grad method does not estimate it well enough. # smaller range as our grad method does not estimate it well enough.
normal=(rand_ranged(1e-1, 8, (2, 3)),),) normal=(rand_ranged(1e-1, 8, (2, 3)),),)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论