Commit 530999f7 authored by Frederic Bastien

Make GammaLn work on the GPU

Parent commit: f4c7ecfb
......@@ -8,7 +8,7 @@ from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined
from theano.scalar import Scalar, Composite
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scalar.basic_scipy import Erfinv, Erfcinv, GammaLn
from theano.scalar.basic import upgrade_to_float_no_complex, complex_types
try:
......@@ -2493,6 +2493,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return kernels
class GpuGammaLn(GammaLn):
    """Log-gamma (``lgamma``) scalar op specialized for the GPU.

    Behaves exactly like the CPU ``GammaLn``; the only difference is the
    C header it requests, so the generated kernel code can resolve
    ``lgamma`` on the device (via CUDA's ``math_functions.h``).
    """

    def c_headers(self):
        # On the GPU, lgamma comes from CUDA's math_functions.h rather
        # than the host math library.
        needed = ['math_functions.h']
        return needed


gpu_gammaln = GpuGammaLn(upgrade_to_float_no_complex, name='gpu_gammaln')
class GpuErfinv(Erfinv):
"""
Inverse error function for GPU.
......@@ -2512,6 +2519,7 @@ class GpuErfinv(Erfinv):
# For consistency of CPU and GPU ops, we wrap the CUDA erfinv in the following conditions
# to ensure that GPU op returns the same values as CPU op.
return "%(z)s = (%(x)s <= -1) ? erfinv(-1.0): ((%(x)s >= 1) ? erfinv(1.0): erfinv(%(x)s));" % locals()
gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
class GpuErfcinv(Erfcinv):
......@@ -2533,8 +2541,6 @@ class GpuErfcinv(Erfcinv):
# For consistency of CPU and GPU ops, we wrap the CUDA erfcinv in the following conditions
# to ensure that GPU op returns the same values as CPU op.
return "%(z)s = (%(x)s <= 0) ? erfcinv(0.0): ((%(x)s >= 2) ? erfcinv(2.0): erfcinv(%(x)s));" % locals()
gpu_erfinv = GpuErfinv(upgrade_to_float_no_complex, name='gpu_erfinv')
gpu_erfcinv = GpuErfcinv(upgrade_to_float_no_complex, name='gpu_erfcinv')
......
......@@ -19,7 +19,7 @@ from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scalar.basic_scipy import Erfinv, Erfcinv, GammaLn
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet import bn
......@@ -61,7 +61,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_crossentropy_softmax_argmax_1hot_with_bias,
gpu_softmax_with_bias, gpu_softmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY, gpu_erfinv, gpu_erfcinv,
GpuCAReduceCPY, gpu_erfinv, gpu_erfcinv, gpu_gammaln,
max_inputs_to_GpuElemwise)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor,
......@@ -711,18 +711,16 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
have_opencl = True
elif kind.startswith(b'cuda'):
have_cuda = True
opname = False
if isinstance(scal_op, Erfinv):
opname = 'erfinv'
if have_cuda:
scal_op = gpu_erfinv
elif isinstance(scal_op, Erfcinv):
opname = 'erfcinv'
if have_cuda:
scal_op = gpu_erfcinv
if opname:
convert = {Erfinv: gpu_erfinv,
Erfcinv: gpu_erfcinv,
GammaLn: gpu_gammaln}
if scal_op.__class__ in convert:
scal_op = convert[scal_op.__class__]
if have_opencl:
_logger.warning('Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' % opname)
_logger.warning(
'Function "%s" is not supported with OpenCL. Use "device=cuda" instead.' %
scal_op)
if not have_cuda:
return None
res = GpuElemwise(scal_op, name=name,
......
......@@ -271,11 +271,17 @@ class GammaLn(UnaryScalarOp):
z, = out
# no c code for complex
# [u]int* will be casted to float64 before computation
if x.type in complex_types:
if node.inputs[0].type in complex_types:
raise NotImplementedError(
'gammaln complex c code is not implemented')
return """%(z)s =
lgamma(%(x)s);""" % locals()
# For some reason, on the GPU, uint64 inputs don't get casted
# automatically to float64. This make the compilation crash
dtype = ""
if node.outputs[0].dtype == 'float64':
dtype = "(double)"
elif node.outputs[0].dtype == 'float32':
dtype = "(float)"
return """%(z)s = lgamma(%(dtype)s%(x)s);""" % locals()
gammaln = GammaLn(upgrade_to_float, name='gammaln')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论