Merge pull request #5806 from nouiz/gammaln

Make GammaLn work on the new GPU back-end

Merge pull request #5806 from nouiz/gammaln
33eafac3 · Frédéric Bastien · GitHub · bea31470 · 33bb44c8 · 33eafac3
--- a/.travis/travis_install.sh
+++ b/.travis/travis_install.sh
@@ -9,6 +9,6 @@ else
 fi
 source activate pyenv
-if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.0 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx mkl-service libgfortran=1; fi
+if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.0 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx=1.5.1 mkl-service libgfortran=1; fi
-if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx mkl-service; fi
+if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda install --yes -q mkl numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx=1.5.1 mkl-service; fi
 source deactivate
--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -1796,6 +1796,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
    if isinstance(mode, (list, tuple)):  # "mode comparison" semantics
        raise Exception("We do not support the passing of multiple modes")
+    fn = None
    try:
        Maker = getattr(mode, 'function_maker', FunctionMaker)
        fn = Maker(inputs,
@@ -1808,7 +1809,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
            defaults)
    finally:
        t2 = time.time()
-        if profile:
+        if fn and profile:
            profile.compile_time += t2 - t1
            # TODO: append
            profile.nb_nodes = len(fn.maker.fgraph.apply_nodes)

--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -89,6 +89,7 @@ def _atexit_print_fn():
                # merge dictonary
                for attr in ["apply_time", "apply_callcount",
                             "apply_cimpl", "variable_shape", "variable_strides",
+                             "variable_offset",
                             "linker_make_thunk_time"]:
                    cum_attr = getattr(cum, attr)
                    for key, val in iteritems(getattr(ps, attr)):
@@ -229,6 +230,10 @@ class ProfileStats(object):
    # Variable -> strides
    #
+    variable_offset = {}
+    # Variable -> offset
+    #
    optimizer_time = 0.0
    # time spent optimizing graph (FunctionMaker.__init__)
@@ -295,6 +300,7 @@ class ProfileStats(object):
        self.apply_cimpl = {}
        self.variable_shape = {}
        self.variable_strides = {}
+        self.variable_offset = {}
        if flag_time_thunks is None:
            self.flag_time_thunks = config.profiling.time_thunks
        else:
@@ -697,15 +703,21 @@ class ProfileStats(object):
            for idx, var in enumerate(a.inputs):
                sh = self.variable_shape.get(var, 'no shape')
                st = self.variable_strides.get(var, 'no strides')
+                off = self.variable_offset.get(var, '')
+                if off != '':
+                    off = ", offset=%s" % off
                dtype = getattr(var, 'dtype', 'no dtype')
-                print("    input %d: dtype=%s, shape=%s, strides=%s " % (
+                print("    input %d: dtype=%s, shape=%s, strides=%s%s" % (
-                    idx, dtype, sh, st), file=file)
+                    idx, dtype, sh, st, off), file=file)
            for idx, var in enumerate(a.outputs):
                sh = self.variable_shape.get(var, 'no shape')
                st = self.variable_strides.get(var, 'no strides')
+                off = self.variable_offset.get(var, '')
+                if off != '':
+                    off = ", offset=%s" % off
                dtype = getattr(var, 'dtype', 'no dtype')
-                print("    output %d: dtype=%s, shape=%s, strides=%s " % (
+                print("    output %d: dtype=%s, shape=%s, strides=%s%s" % (
-                    idx, dtype, sh, st), file=file)
+                    idx, dtype, sh, st, off), file=file)
            # Same as before, this I've sacrificied some information making
            # the output more readable
        print('   ... (remaining %i Apply instances account for '

--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
@@ -207,6 +207,7 @@ class VM(object):
        if hasattr(self, 'variable_shape'):
            profile.variable_shape = self.variable_shape.copy()
            profile.variable_strides = self.variable_strides.copy()
+            profile.variable_offset = self.variable_offset.copy()
        if hasattr(self, 'node_executed_order'):
            profile.node_executed_order = self.node_executed_order[:]
@@ -342,6 +343,7 @@ class Stack(VM):
        self.storage_map = storage_map
        self.variable_shape = {}  # Variable -> shape
        self.variable_strides = {}  # Variable -> strides
+        self.variable_offset = {}  # Variable -> offset
        self.compute_map = compute_map
        self.node_idx = node_idx = {}
        self.callback = callback
@@ -436,15 +438,17 @@ class Stack(VM):
            if hasattr(var.type, 'get_shape_info'):
                sh = var.type.get_shape_info(data[0])
            else:
-                sh = 'input no shape'
+                sh = 'no shape'
            self.variable_shape[var] = sh
-            st = getattr(data[0], 'strides', 'input no strides')
+            st = getattr(data[0], 'strides', 'no strides')
            if getattr(data[0], 'flags', False) and data[0].flags.c_contiguous:
                st = 'c'
            elif (hasattr(data[0], 'is_c_contiguous') and
                  data[0].is_c_contiguous()):
                st = "c"
            self.variable_strides[var] = st
+            off = getattr(data[0], 'offset', '')
+            self.variable_offset[var] = off
        while apply_stack:
            # Make sure something happened last time round.  This is
@@ -495,17 +499,19 @@ class Stack(VM):
                                if hasattr(var.type, 'get_shape_info'):
                                    sh = var.type.get_shape_info(o[0])
                                else:
-                                    sh = 'input no shape'
+                                    sh = 'no shape'
                                self.variable_shape[var] = sh
                                st = getattr(o[0], 'strides',
-                                             'input no strides')
+                                             'no strides')
                                if (getattr(o[0], 'flags', False) and
                                        o[0].flags.c_contiguous):
                                    st = 'c'
-                                elif (hasattr(data[0], 'is_c_contiguous') and
+                                elif (hasattr(o[0], 'is_c_contiguous') and
-                                      data[0].is_c_contiguous()):
+                                      o[0].is_c_contiguous()):
                                    st = "c"
                                self.variable_strides[var] = st
+                                off = getattr(o[0], 'offset', '')
+                                self.variable_offset[var] = off
                    except Exception:
                        link.raise_with_op(
                            current_apply,
@@ -604,16 +610,18 @@ class Stack(VM):
                            if hasattr(var.type, 'get_shape_info'):
                                sh = var.type.get_shape_info(o[0])
                            else:
-                                sh = 'input no shape'
+                                sh = 'no shape'
                            self.variable_shape[var] = sh
-                            st = getattr(o[0], 'strides', 'input no strides')
+                            st = getattr(o[0], 'strides', 'no strides')
                            if (getattr(o[0], 'flags', False) and
                                    o[0].flags.c_contiguous):
                                st = 'c'
-                            elif (hasattr(data[0], 'is_c_contiguous') and
+                            elif (hasattr(o[0], 'is_c_contiguous') and
-                                  data[0].is_c_contiguous()):
+                                  o[0].is_c_contiguous()):
                                st = "c"
                            self.variable_strides[var] = st
+                            off = getattr(o[0], 'offset', '')
+                            self.variable_offset[var] = off
                    input_index = []

--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -97,7 +97,9 @@ def init_dev(dev, name=None):
        # Initialise the blas kernels.  We do this after the
        # preallocation to not fragment the heap accidentally.
        tmp = pygpu.empty((2, 2), dtype='float32', context=context)
-        pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True)
+        if dev.startswith('cuda'):
+            # In OpenCL, BLAS isn't always available
+            pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True)
        del tmp
    else:
        context = init_dev.devmap[dev]

--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -423,12 +423,11 @@ class GpuGemmBatch(BlasOp):
    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
-                    beta=inp[4], fail=sub['fail'], name=name)
+                    beta=inp[4], inplace=int(self.inplace),
+                    fail=sub['fail'], name=name)
        code = """
        int err;
-        """
+        if (%(inplace)s){
-        if self.inplace:
-            code += """
                   if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
                     %(out)s = theano_try_copy(%(out)s, %(C)s);
                     if (%(out)s == NULL) {
@@ -439,15 +438,12 @@ class GpuGemmBatch(BlasOp):
                     %(out)s = %(C)s;
                     Py_INCREF(%(out)s);
                   }
-                   """ % vars
+        } else {
-        else:
-            code += """
                   %(out)s = theano_try_copy(%(out)s, %(C)s);
                   if (%(out)s == NULL) {
                       %(fail)s
                   }
-                   """ % vars
+        }
-        code += """
        err = GpuArray_rgemmBatch_3d(
            cb_no_trans, cb_no_trans,
            ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
@@ -467,7 +463,7 @@ class GpuGemmBatch(BlasOp):
        return code
    def c_code_cache_version(self):
-        return (1,)
+        return (2,)
 gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
 gpugemmbatch_inplace = GpuGemmBatch(inplace=True)

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -2512,6 +2512,7 @@ class GpuErfinv(Erfinv):
        # For consistency of CPU and GPU ops, we wrap the CUDA erfinv in the following conditions
        # to ensure that GPU op returns the same values as CPU op.
        return "%(z)s = (%(x)s <= -1) ? erfinv(-1.0): ((%(x)s >= 1) ? erfinv(1.0): erfinv(%(x)s));" % locals()
+gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
 class GpuErfcinv(Erfcinv):
@@ -2533,8 +2534,6 @@ class GpuErfcinv(Erfcinv):
        # For consistency of CPU and GPU ops, we wrap the CUDA erfcinv in the following conditions
        # to ensure that GPU op returns the same values as CPU op.
        return "%(z)s = (%(x)s <= 0) ? erfcinv(0.0): ((%(x)s >= 2) ? erfcinv(2.0): erfcinv(%(x)s));" % locals()
-gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
 gpu_erfcinv = GpuErfcinv(upgrade_to_float_no_complex, name='gpu_erfcinv')

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -711,18 +711,15 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
            have_opencl = True
        elif kind.startswith(b'cuda'):
            have_cuda = True
-    opname = False
+    convert = {Erfinv: gpu_erfinv,
-    if isinstance(scal_op, Erfinv):
+               Erfcinv: gpu_erfcinv}
-        opname = 'erfinv'
-        if have_cuda:
+    if scal_op.__class__ in convert:
-            scal_op = gpu_erfinv
+        scal_op = convert[scal_op.__class__]
-    elif isinstance(scal_op, Erfcinv):
-        opname = 'erfcinv'
-        if have_cuda:
-            scal_op = gpu_erfcinv
-    if opname:
        if have_opencl:
-            _logger.warning('Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' % opname)
+            _logger.warning(
+                'Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' %
+                scal_op)
        if not have_cuda:
            return None
    res = GpuElemwise(scal_op, name=name,

--- a/theano/scalar/basic_scipy.py
+++ b/theano/scalar/basic_scipy.py
@@ -269,10 +269,19 @@ class GammaLn(UnaryScalarOp):
    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out
-        if node.inputs[0].type in float_types:
+        # no c code for complex
-            return """%(z)s =
+        # [u]int* will be cast to float64 before computation
-                lgamma(%(x)s);""" % locals()
+        if node.inputs[0].type in complex_types:
-        raise NotImplementedError('only floating point is implemented')
+            raise NotImplementedError(
+                'gammaln complex c code is not implemented')
+        # For some reason, on the GPU, uint64 inputs don't get cast
+        # automatically to float64, which makes the compilation crash.
+        dtype = ""
+        if node.outputs[0].dtype == 'float64':
+            dtype = "(double)"
+        elif node.outputs[0].dtype == 'float32':
+            dtype = "(float)"
+        return """%(z)s = lgamma(%(dtype)s%(x)s);""" % locals()
 gammaln = GammaLn(upgrade_to_float, name='gammaln')

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -1807,7 +1807,8 @@ _good_broadcast_unary_gammaln = dict(
    empty=(np.asarray([], dtype=config.floatX),),
    int=(randint_ranged(1, 10, (2, 3)),),
    uint8=(randint_ranged(1, 6, (2, 3)).astype('uint8'),),
-    uint16=(randint_ranged(1, 10, (2, 3)).astype('uint16'),))
+    uint16=(randint_ranged(1, 10, (2, 3)).astype('uint16'),),
+    uint64=(randint_ranged(1, 10, (2, 3)).astype('uint64'),))
 _grad_broadcast_unary_gammaln = dict(
    # smaller range as our grad method does not estimate it well enough.
    normal=(rand_ranged(1e-1, 8, (2, 3)),),)