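Fix remaining problems in GpuCusolverSolve: add 'inplace' to __props__, check cuSOLVER availability in make_node, and replace make_thunk with prepare_node/perform using a per-context cuSOLVER handle instead of a module-level global.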

c4c7b31d · Arnaud Bergeron · 86be5809 · c4c7b31d
--- a/theano/gpuarray/linalg.py
+++ b/theano/gpuarray/linalg.py
@@ -18,8 +18,6 @@ try:
 except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass
-cusolver_handle = None
 class GpuCusolverSolve(Op):
    """
@@ -32,7 +30,7 @@ class GpuCusolverSolve(Op):
    """
-    __props__ = ('trans',)
+    __props__ = ('trans', 'inplace')
    def __init__(self, trans='N', inplace=False):
        self.trans = trans
@@ -42,10 +40,13 @@ class GpuCusolverSolve(Op):
        super(GpuCusolverSolve, self).__init__()
    def make_node(self, inp1, inp2):
-        self.context = basic_ops.infer_context_name(inp1, inp2)
+        if not cusolver_available:
+            raise RuntimeError('CUSOLVER is not available and '
+                               'GpuCusolverSolve Op can not be constructed.')
+        context_name = basic_ops.infer_context_name(inp1, inp2)
-        inp1 = basic_ops.as_gpuarray_variable(inp1, self.context)
+        inp1 = basic_ops.as_gpuarray_variable(inp1, context_name)
-        inp2 = basic_ops.as_gpuarray_variable(inp2, self.context)
+        inp2 = basic_ops.as_gpuarray_variable(inp2, context_name)
        inp1 = basic_ops.gpu_contiguous(inp1)
        inp2 = basic_ops.gpu_contiguous(inp2)
@@ -62,110 +63,84 @@ class GpuCusolverSolve(Op):
                          broadcastable=inp1.broadcastable,
                          context_name=self.context)()])
-    def make_thunk(self,
+    def prepare_node(self, node, storage_map, compute_map, impl):
-                   node,
+        ctx = node.inputs[0].type.context
-                   storage_map, _,
+        handle = getattr(ctx, 'cusolver_handle', None)
-                   no_recycling=[],
+        if handle is None:
-                   impl=None):
+            with ctx:
-        if not cusolver_available:
+                ctx.cusolver_handle = cusolver.cusolverDnCreate()
-            raise RuntimeError('CUSOLVER is not available and '
-                               'GpuCusolverSolve Op can not be constructed.')
+    def perform(self, node, inputs, outputs):
+        context = inputs[0][0].context
-        inputs = [storage_map[v] for v in node.inputs]
-        outputs = [storage_map[v] for v in node.outputs]
+        # Size of the matrices to invert.
+        z = outputs[0]
-        global cusolver_handle
-        if cusolver_handle is None:
+        # Matrix.
-            cusolver_handle = cusolver.cusolverDnCreate()
+        A = inputs[0]
-        def thunk():
+        # Solution vectors.
-            context = inputs[0][0].context
+        b = inputs[1]
-            # Size of the matrices to invert.
+        assert(len(A.shape) == 2)
-            z = outputs[0]
+        assert(len(b.shape) == 2)
-            # Matrix.
+        if self.trans in ['T', 'C']:
-            A = inputs[0][0]
+            trans = 1
+            l, n = A.shape
-            # Solution vectors.
+            k, m = b.shape
-            b = inputs[1][0]
+        elif self.trans == 'N':
+            trans = 0
-            assert(len(A.shape) == 2)
+            n, l = A.shape
-            assert(len(b.shape) == 2)
+            k, m = b.shape
+        else:
-            if self.trans in ['T', 'C']:
+            raise ValueError('Invalid value for trans')
-                trans = 1
+        if l != n:
-                l, n = A.shape
+            raise ValueError('A must be a square matrix')
-                k, m = b.shape
+        if n != k:
-            elif self.trans == 'N':
+            raise ValueError('A and b must be aligned.')
-                trans = 0
-                n, l = A.shape
+        lda = max(1, n)
-                k, m = b.shape
+        ldb = max(1, k)
-            else:
-                raise ValueError('Invalid value for trans')
+        # We copy A and b as cusolver operates inplace
-            if l != n:
+        b = pygpu.array(b, copy=True, order='F')
-                raise ValueError('A must be a square matrix')
+        if not self.inplace:
-            if n != k:
+            A = pygpu.array(A, copy=True)
-                raise ValueError('A and b must be aligned.')
+        A_ptr = A.gpudata
+        b_ptr = b.gpudata
-            lda = max(1, n)
-            ldb = max(1, k)
+        # cusolver expects a F ordered matrix, but A is not explicitly
+        # converted between C and F order, instead we switch the
-            # We copy A and b as cusolver operates inplace
+        # "transpose" flag.
-            b = pygpu.array(b, copy=True, order='F')
+        if A.flags['C_CONTIGUOUS']:
-            if not self.inplace:
+            trans = 1 - trans
-                A = pygpu.array(A, copy=True)
-            A_ptr = A.gpudata
+        with context:
-            b_ptr = b.gpudata
-            # cusolver expects a F ordered matrix, but A is not explicitly
-            # converted between C and F order, instead we switch the
-            # "transpose" flag.
-            if A.flags['C_CONTIGUOUS']:
-                trans = 1 - trans
            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
-                cusolver_handle, n, n, A_ptr, lda)
+                context.cusolver_handle, n, n, A_ptr, lda)
-            if (thunk.workspace is None or
+        workspace = pygpu.zeros(workspace_size, dtype='float32',
-                    thunk.workspace.size != workspace_size):
+                                context=context)
-                thunk.workspace = pygpu.zeros(workspace_size,
-                                              dtype='float32',
-                                              context=context)
-            if thunk.pivots is None or thunk.pivots.size != min(n, n):
+        pivots = pygpu.zeros(n, dtype='int32', context=context)
-                thunk.pivots = pygpu.zeros(n,
-                                           dtype='int32',
-                                           context=context)
-            if thunk.dev_info is None:
+        dev_info = pygpu.zeros((1,), dtype='int32', context=context)
-                thunk.dev_info = pygpu.zeros((1,),
-                                             dtype='int32',
-                                             context=context)
-            workspace_ptr = thunk.workspace.gpudata
+        workspace_ptr = workspace.gpudata
-            pivots_ptr = thunk.pivots.gpudata
+        pivots_ptr = pivots.gpudata
-            dev_info_ptr = thunk.dev_info.gpudata
+        dev_info_ptr = dev_info.gpudata
+        with context:
            cusolver.cusolverDnSgetrf(
-                cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
+                context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                pivots_ptr, dev_info_ptr)
            cusolver.cusolverDnSgetrs(
-                cusolver_handle, trans, n, m, A_ptr, lda,
+                context.cusolver_handle, trans, n, m, A_ptr, lda,
                pivots_ptr, b_ptr, ldb, dev_info_ptr)
-            z[0] = b
+        z[0] = b
-        thunk.inputs = inputs
-        thunk.outputs = outputs
-        thunk.lazy = False
-        thunk.workspace = None
-        thunk.pivots = None
-        thunk.dev_info = None
-        return thunk
 def gpu_solve(A, b, trans='N'):