Addressed nouiz comments.

cf4e0264 · notoraptor · 4c87b1b3 · cf4e0264 · cf4e0264 · cf4e0264
--- a/theano/gpuarray/linalg.py
+++ b/theano/gpuarray/linalg.py
@@ -5,7 +5,6 @@ import theano
 import warnings
 from theano import Op
 from theano.gpuarray import basic_ops, GpuArrayType
 import numpy as np
@@ -254,7 +253,7 @@ class GpuCholesky(Op):
            warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
        if not pygpu_available:
            raise RuntimeError('Missing pygpu or triu/tril functions.'
-                               'Try updating libgpuarray?')
+                               'Install or update libgpuarray.')
        context_name = basic_ops.infer_context_name(inp)
        inp = basic_ops.as_gpuarray_variable(inp, context_name)
@@ -262,14 +261,12 @@ class GpuCholesky(Op):
        inp = basic_ops.gpu_contiguous(inp)
        # this op can only operate on float32 matrices
+        # because of the current implementation of triu/tril.
+        # TODO: support float64 for triu/tril in GpuArray and for GpuCholesky/GpuCusolverSolve in Theano.
        assert inp.ndim == 2
        assert inp.dtype == 'float32'
-        return theano.Apply(
+        return theano.Apply(self, [inp], [inp.type()])
-            self, [inp],
-            [GpuArrayType('float32',
-                          broadcastable=inp.broadcastable,
-                          context_name=context_name)()])
    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1977,6 +1977,13 @@ def local_gpu_cholesky(op, context_name, inputs, outputs):
        return
    return GpuCholesky(lower=op.lower, inplace=op.destructive)
+@register_inplace()
+@local_optimizer([GpuCholesky], inplace=True)
+def local_inplace_cholesky(node):
+    if isinstance(node.op, GpuCholesky) and not node.op.inplace:
+        return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)]
 # Do not register in fast_run or fast_compile.
 # It will be added to fast_run if the GPU is enabled.
 optdb.register('gpua_scanOp_make_inplace',

--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
@@ -593,7 +593,8 @@ def test_local_lift_cholesky():
    f_gpu = theano.function([A], o, mode=mode_with_gpu)
    assert not any(isinstance(n.op, slinalg.Cholesky)
                   for n in f_gpu.maker.fgraph.apply_nodes)
-    assert any(isinstance(n.op, GpuCholesky)
+    # The GpuCholesky op in this graph should be inplace (as its input is not reused by any other op).
+    assert any(isinstance(n.op, GpuCholesky) and n.op.inplace
               for n in f_gpu.maker.fgraph.apply_nodes)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # A = M.dot(M) will be positive definite for all non-singular M
@@ -601,6 +602,25 @@ def test_local_lift_cholesky():
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
+def test_gpu_cholesky_not_inplace():
+    if not cusolver_available:
+        raise SkipTest('No cuSolver')
+    A = tensor.fmatrix()
+    A_squared = A**2
+    B = slinalg.cholesky(A_squared)
+    D = B + A_squared
+    f_cpu = theano.function([A], D, mode=mode_without_gpu)
+    f_gpu = theano.function([A], D, mode=mode_with_gpu)
+    # The GpuCholesky op in this graph should NOT be inplace (as its input is reused by another op).
+    count_cholesky_not_inplace = len([n.op for n in f_gpu.maker.fgraph.apply_nodes
+                                      if isinstance(n.op, GpuCholesky) and not n.op.inplace])
+    assert count_cholesky_not_inplace == 1, count_cholesky_not_inplace
+    M_val = np.random.normal(size=(3, 3)).astype("float32")
+    # A = M.dot(M.T) will be positive definite for all non-singular M
+    A_val = M_val.dot(M_val.T)
+    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
 def test_local_gpua_advanced_incsubtensor():
    # test a corner case reported at gh-5589
    target = tensor.ftensor4()