提交 3f9d0601 authored 作者: sentient07's avatar sentient07

Changed op_lifter to accept Op and Inputs

上级 e45b6cd6
...@@ -1498,35 +1498,35 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1498,35 +1498,35 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([Pool]) @op_lifter([Pool])
def local_pool_dnn_alternative(node, ctx_name): def local_pool_dnn_alternative(op, ctx_name, inputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
img, = node.inputs img, = inputs
img = as_gpuarray_variable(img, ctx_name) img = as_gpuarray_variable(img, ctx_name)
ds = node.op.ds ds = op.ds
stride = node.op.st stride = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode) return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode)
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node, ctx_name): def local_pool_dnn_grad_stride(op, ctx_name, inputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out, out_grad = node.inputs inp, out, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out, ctx_name) out = as_gpuarray_variable(out, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name) out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds ds = op.ds
st = node.op.st st = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
...@@ -1538,18 +1538,18 @@ def local_pool_dnn_grad_stride(node, ctx_name): ...@@ -1538,18 +1538,18 @@ def local_pool_dnn_grad_stride(node, ctx_name):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node, ctx_name): def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
inp, out_grad = node.inputs inp, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name) out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds ds = op.ds
st = node.op.st st = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
cg = gpu_contiguous(out_grad) cg = gpu_contiguous(out_grad)
...@@ -1591,9 +1591,9 @@ def local_log_softmax_dnn(node): ...@@ -1591,9 +1591,9 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name): def local_logsoftmax_to_dnn(op, ctx_name, inputs):
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0] inp = inputs[0]
if inp.ndim != 2: if inp.ndim != 2:
return return
if not dnn_available(ctx_name) or version(raises=False) < 3000: if not dnn_available(ctx_name) or version(raises=False) < 3000:
...@@ -1629,11 +1629,11 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') ...@@ -1629,11 +1629,11 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node, ctx_name): def local_softmax_dnn_grad(op, ctx_name, inputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad") raise_no_cudnn("cuDNN needed for SoftmaxGrad")
ins = [] ins = []
for n in node.inputs: for n in inputs:
n = as_gpuarray_variable(n, ctx_name) n = as_gpuarray_variable(n, ctx_name)
if n.ndim != 2: if n.ndim != 2:
return return
......
...@@ -452,10 +452,10 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -452,10 +452,10 @@ class GpuCumsum(GpuKernelBase, Op):
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
def use_gpu_cumsumop(node, ctx_name): def use_gpu_cumsumop(op, ctx_name, inputs):
if node.inputs[0].dtype == 'float32': if inputs[0].dtype == 'float32':
axis = node.op.axis axis = op.axis
x = node.inputs[0] x = inputs[0]
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS: if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
return None return None
......
...@@ -229,21 +229,21 @@ KERNEL void k_multi_warp_multinomial( ...@@ -229,21 +229,21 @@ KERNEL void k_multi_warp_multinomial(
@register_opt() @register_opt()
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform]) @op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name): def local_gpua_multinomial(op, context_name, inputs):
# TODO : need description for function # TODO : need description for function
if len(node.inputs) == 2: if len(inputs) == 2:
p, u = node.inputs p, u = inputs
n_samples = 1 n_samples = 1
else: else:
p, u, n_samples = node.inputs p, u, n_samples = inputs
try: try:
if get_scalar_constant_value(n_samples) != 1: if get_scalar_constant_value(n_samples) != 1:
return None return None
except NotScalarConstantError: except NotScalarConstantError:
return None return None
m, = node.outputs m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'): if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(node.op.odtype) gpu_op = GPUAMultinomialFromUniform(op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])( return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u)) gpu_op(p, u))
...@@ -469,8 +469,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -469,8 +469,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
@op_lifter([Images2Neibs]) @op_lifter([Images2Neibs])
def use_gpu_images2neibs(node, context_name): def use_gpu_images2neibs(op, context_name, inputs):
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']: if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(node.op.mode) return GpuImages2Neibs(op.mode)
register_gpu_opt()(use_gpu_images2neibs) register_gpu_opt()(use_gpu_images2neibs)
...@@ -149,14 +149,14 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz, ...@@ -149,14 +149,14 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
@opt.register_opt() @opt.register_opt()
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node, ctx_name): def local_dot_to_gemm16(op, ctx_name, inputs):
if nerv is None: if nerv is None:
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)( C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
...@@ -161,9 +161,18 @@ def op_lifter(OP, cuda_only=False): ...@@ -161,9 +161,18 @@ def op_lifter(OP, cuda_only=False):
context_name = i.owner.inputs[0].type.context_name context_name = i.owner.inputs[0].type.context_name
replace = True replace = True
break break
clients = [c for o in node.outputs for c in o.clients]
# list of list containing clients
# it is clients per node basis
out_clients = []
for o in node.outputs:
if o.clients:
out_clients.append(o.clients)
else:
out_clients.append([])
if not replace: if not replace:
# We replace if *all* clients are on the GPU # We replace if *all* clients are on the GPU
clients = [c for o in node.outputs for c in o.clients]
replace = len(clients) != 0 replace = len(clients) != 0
for c, idx in clients: for c, idx in clients:
if (c == 'output' or if (c == 'output' or
...@@ -184,10 +193,11 @@ def op_lifter(OP, cuda_only=False): ...@@ -184,10 +193,11 @@ def op_lifter(OP, cuda_only=False):
# the context was derived from the outputs # the context was derived from the outputs
for i in node.inputs: for i in node.inputs:
i.tag.context_name = context_name i.tag.context_name = context_name
try: try:
new_op = maker(node, context_name, node.inputs) new_op = maker(node.op, context_name, node.inputs)
except TypeError: except TypeError:
new_op = maker(node, context_name) new_op = maker(node.op, context_name, node.inputs, out_clients)
# This is needed as sometimes new_op inherits from OP. # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
...@@ -278,32 +288,45 @@ class GraphToGPU(Optimizer): ...@@ -278,32 +288,45 @@ class GraphToGPU(Optimizer):
continue continue
# Move only if any of the inputs are on the GPU. # Move only if any of the inputs are on the GPU.
move_to_GPU = False move_to_GPU = True
'''
if any([isinstance(i, GpuArrayVariable) or if any([isinstance(i, GpuArrayVariable) or
isinstance(i, GpuArraySharedVariable) isinstance(i, GpuArraySharedVariable)
for i in [mapping[v] for v in node.inputs] + for i in [mapping[v] for v in node.inputs] +
node.outputs]): node.outputs]):
move_to_GPU = True move_to_GPU = True
'''
out_clients = []
for o in node.outputs:
if o.clients:
out_clients.append(o.clients)
else:
out_clients.append([])
context_name = None
for i in [mapping[i] for i in node.inputs]:
if isinstance(i.type, GpuArrayType):
context_name = i.type.context_name
break
new_ops = None new_ops = None
outputs = []
# Apply the lifter # Apply the lifter
for lopt in (self.local_optimizers_all + for lopt in (self.local_optimizers_all +
self.local_optimizers_map.get(type(node.op), []) + self.local_optimizers_map.get(type(node.op), []) +
self.local_optimizers_map.get(node.op, [])): self.local_optimizers_map.get(node.op, [])):
replace = False
for i in [mapping[i] for i in node.inputs]: if move_to_GPU:
if isinstance(i.type, GpuArrayType):
context_name = i.type.context_name
replace = True
break
if replace:
try: try:
new_ops = lopt.transform( new_ops = lopt.transform(
node, context_name, node.op, context_name,
[mapping[i] for i in node.inputs]) [mapping[i] for i in node.inputs])
except TypeError: except TypeError:
new_ops = lopt.transform(node, context_name) new_ops = lopt.transform(node.op, context_name,
[mapping[i] for i in node.inputs],
out_clients)
if new_ops: if new_ops:
break break
if not new_ops: if not new_ops:
...@@ -422,18 +445,18 @@ def local_gpuaalloc2(node): ...@@ -422,18 +445,18 @@ def local_gpuaalloc2(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
@register_opt2([tensor.Alloc], 'fast_compile') @register_opt2([tensor.Alloc], 'fast_compile')
def local_gpuaalloc(node, context_name): def local_gpuaalloc(op, context_name, inputs):
return GpuAlloc(context_name)(*node.inputs) return GpuAlloc(context_name)(*inputs)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty]) @op_lifter([tensor.AllocEmpty])
@register_opt2([tensor.AllocEmpty], 'fast_compile') @register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpuaallocempty(node, context_name): def local_gpuaallocempty(op, context_name, inputs):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
return GpuAllocEmpty(context_name=context_name, return GpuAllocEmpty(context_name=context_name,
**node.op._props_dict())(*node.inputs) **op._props_dict())(*inputs)
@register_opt() @register_opt()
...@@ -479,15 +502,14 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -479,15 +502,14 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous]) @op_lifter([tensor.extra_ops.CpuContiguous])
@register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile') @register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
def local_gpu_contiguous(node, context_name): def local_gpu_contiguous(op, context_name, inputs):
return gpu_contiguous return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
@register_opt2([tensor.Reshape], 'fast_compile') @register_opt2([tensor.Reshape], 'fast_compile')
def local_gpureshape(node, context_name): def local_gpureshape(op, context_name, inputs):
op = node.op
name = op.name name = op.name
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
...@@ -498,15 +520,14 @@ def local_gpureshape(node, context_name): ...@@ -498,15 +520,14 @@ def local_gpureshape(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
@register_opt2([tensor.Rebroadcast], 'fast_compile') @register_opt2([tensor.Rebroadcast], 'fast_compile')
def local_gpu_rebroadcast(node, context_name, inputs): def local_gpu_rebroadcast(op, context_name, inputs):
return node.op(as_gpuarray_variable(inputs[0], context_name)) return op(as_gpuarray_variable(inputs[0], context_name))
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
@register_opt2([tensor.Flatten], 'fast_compile') @register_opt2([tensor.Flatten], 'fast_compile')
def local_gpuflatten(node, context_name, inputs): def local_gpuflatten(op, context_name, inputs):
op = node.op
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
shp = [inputs[0].shape[i] for i in range(op.outdim - 1)] shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
...@@ -519,10 +540,10 @@ def local_gpuflatten(node, context_name, inputs): ...@@ -519,10 +540,10 @@ def local_gpuflatten(node, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
@register_opt2([tensor.Elemwise], 'fast_compile') @register_opt2([tensor.Elemwise], 'fast_compile')
def local_gpu_elemwise(node, context_name, inputs): def local_gpu_elemwise(op, context_name, inputs):
op = node.op
scal_op = op.scalar_op scal_op = op.scalar_op
name = op.name name = op.name
node = op.make_node(*inputs)
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
if len(node.outputs) > 1: if len(node.outputs) > 1:
...@@ -593,15 +614,15 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -593,15 +614,15 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
@register_opt2([tensor.DimShuffle], 'fast_compile') @register_opt2([tensor.DimShuffle], 'fast_compile')
def local_gpua_dimshuffle(node, context_name): def local_gpua_dimshuffle(op, context_name, inputs):
return GpuDimShuffle(node.op.input_broadcastable, return GpuDimShuffle(op.input_broadcastable,
node.op.new_order) op.new_order)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
@register_opt2([tensor.SpecifyShape], 'fast_compile') @register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape(node, context_name, inputs): def local_gpua_specifyShape(op, context_name, inputs):
if isinstance(inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
return return
inp = [as_gpuarray_variable(inputs[0], context_name)] inp = [as_gpuarray_variable(inputs[0], context_name)]
...@@ -627,11 +648,11 @@ def gpu_print_wrapper(op, cnda): ...@@ -627,11 +648,11 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
@register_opt2([tensor.printing.Print], 'fast_compile') @register_opt2([tensor.printing.Print], 'fast_compile')
def local_gpu_print_op(node, context_name, inputs): def local_gpu_print_op(op, context_name, inputs):
x, = inputs x, = inputs
gpu_x = as_gpuarray_variable(x, context_name=context_name) gpu_x = as_gpuarray_variable(x, context_name=context_name)
new_op = node.op.__class__(global_fn=gpu_print_wrapper) new_op = op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = node.op new_op.old_op = op
return new_op(gpu_x) return new_op(gpu_x)
...@@ -706,8 +727,8 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -706,8 +727,8 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([IfElse]) @op_lifter([IfElse])
@register_opt2([IfElse], 'fast_compile') @register_opt2([IfElse], 'fast_compile')
def local_gpua_lazy_ifelse(node, context_name, inputs): def local_gpua_lazy_ifelse(op, context_name, inputs):
if node.op.gpu: if op.gpu:
return return
c = inputs[0] c = inputs[0]
inps = [] inps = []
...@@ -716,13 +737,13 @@ def local_gpua_lazy_ifelse(node, context_name, inputs): ...@@ -716,13 +737,13 @@ def local_gpua_lazy_ifelse(node, context_name, inputs):
inps.append(as_gpuarray_variable(v, context_name)) inps.append(as_gpuarray_variable(v, context_name))
else: else:
inps.append(v) inps.append(v)
return IfElse(node.op.n_outs, gpu=True)(c, *inps, return_list=True) return IfElse(op.n_outs, gpu=True)(c, *inps, return_list=True)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
@register_opt2([tensor.Join], 'fast_compile') @register_opt2([tensor.Join], 'fast_compile')
def local_gpua_join(node, context_name): def local_gpua_join(op, context_name, inputs):
return gpu_join return gpu_join
...@@ -738,16 +759,17 @@ def local_gpuajoin_1(node): ...@@ -738,16 +759,17 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
@register_opt2([tensor.Split], 'fast_compile') @register_opt2([tensor.Split], 'fast_compile')
def local_gpua_split(node, context_name): def local_gpua_split(op, context_name, inputs):
#TODO use props #TODO use props
return GpuSplit(node.op.len_splits) return GpuSplit(op.len_splits)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
@register_opt2([tensor.Subtensor], 'fast_compile') @register_opt2([tensor.Subtensor], 'fast_compile')
def local_gpua_subtensor(node, context_name): def local_gpua_subtensor(op, context_name, inputs, clients):
x = node.inputs[0] x = inputs[0]
node = op.make_node(*inputs)
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if (gpu_x.owner and if (gpu_x.owner and
...@@ -757,23 +779,23 @@ def local_gpua_subtensor(node, context_name): ...@@ -757,23 +779,23 @@ def local_gpua_subtensor(node, context_name):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or any([isinstance(v.type, GpuArrayType) if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
for v in n.inputs + n.outputs]) for v in n.inputs + n.outputs])
for n, _ in node.outputs[0].clients]): for n, _ in clients[0]]):
return return
else: else:
return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))] return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))]
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(op.idx_list)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
@register_opt2([tensor.IncSubtensor], 'fast_compile') @register_opt2([tensor.IncSubtensor], 'fast_compile')
def local_gpua_incsubtensor(node, context_name, inputs): def local_gpua_incsubtensor(op, context_name, inputs):
op = GpuIncSubtensor(node.op.idx_list, node.op.inplace, op = GpuIncSubtensor(op.idx_list, op.inplace,
node.op.set_instead_of_inc, op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased) op.destroyhandler_tolerate_aliased)
ret = op(*inputs) ret = op(*inputs)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True) val = getattr(op.make_node(*inputs).outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val ret.tag.nan_guard_mode_check = val
return ret return ret
...@@ -781,20 +803,20 @@ def local_gpua_incsubtensor(node, context_name, inputs): ...@@ -781,20 +803,20 @@ def local_gpua_incsubtensor(node, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(node, context_name): def local_gpua_advanced_subtensor(op, context_name, inputs):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(node, context_name): def local_gpua_advanced_incsubtensor(op, context_name, inputs):
context = get_context(context_name) context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if context.kind != b'cuda': if context.kind != b'cuda':
return None return None
x, y, ilist = node.inputs x, y, ilist = inputs
# Gpu Ops needs both inputs to have the same dtype # Gpu Ops needs both inputs to have the same dtype
if (x.type.dtype != y.type.dtype): if (x.type.dtype != y.type.dtype):
...@@ -804,7 +826,7 @@ def local_gpua_advanced_incsubtensor(node, context_name): ...@@ -804,7 +826,7 @@ def local_gpua_advanced_incsubtensor(node, context_name):
if y.type.dtype != dtype: if y.type.dtype != dtype:
y = tensor.cast(y, dtype) y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = op.set_instead_of_inc
compute_capability = int(context.bin_id[-2]) compute_capability = int(context.bin_id[-2])
...@@ -828,29 +850,31 @@ def local_advincsub1_gpua_inplace(node): ...@@ -828,29 +850,31 @@ def local_advincsub1_gpua_inplace(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
@register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile') @register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile')
def local_gpua_careduce(node, context_name): def local_gpua_careduce(op, context_name, inputs):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)): scalar.Maximum, scalar.Minimum)):
node = op.make_node(*inputs)
ctx = get_context(context_name) ctx = get_context(context_name)
if ctx.kind == b'opencl': if ctx.kind == b'opencl':
op = GpuCAReduceCPY op2 = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]: if op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code. # We don't support yet all reduction with cpy code.
return return
elif ctx.kind == b'cuda': elif ctx.kind == b'cuda':
op = GpuCAReduceCuda op2 = GpuCAReduceCuda
else: else:
return False return False
x, = node.inputs x, = inputs
greduce = op( greduce = op2(
node.op.scalar_op, axis=node.op.axis, op.scalar_op, axis=op.axis,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op is GpuCAReduceCPY or if (op2 is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([ gvar.owner.op.supports_c_code([
as_gpuarray_variable(x, context_name)])): as_gpuarray_variable(x, context_name)])):
return greduce return greduce
...@@ -861,11 +885,11 @@ def local_gpua_careduce(node, context_name): ...@@ -861,11 +885,11 @@ def local_gpua_careduce(node, context_name):
# to make them a single dimension, do the reduction, and # to make them a single dimension, do the reduction, and
# then reshape to get them back. # then reshape to get them back.
if node.op.axis is None: if op.axis is None:
reduce_mask = [1] * x.type.ndim reduce_mask = [1] * x.type.ndim
else: else:
reduce_mask = [0] * x.type.ndim reduce_mask = [0] * x.type.ndim
for a in node.op.axis: for a in op.axis:
assert reduce_mask[a] == 0 assert reduce_mask[a] == 0
reduce_mask[a] = 1 reduce_mask[a] = 1
...@@ -881,11 +905,11 @@ def local_gpua_careduce(node, context_name): ...@@ -881,11 +905,11 @@ def local_gpua_careduce(node, context_name):
for idx, m in enumerate(new_mask): for idx, m in enumerate(new_mask):
if m == 1: if m == 1:
new_axis.append(idx) new_axis.append(idx)
greduce = op( greduce = op2(
node.op.scalar_op, op.scalar_op,
axis=new_axis, reduce_mask=new_mask, axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name) gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
...@@ -912,8 +936,8 @@ def local_gpua_careduce(node, context_name): ...@@ -912,8 +936,8 @@ def local_gpua_careduce(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
@register_opt2([tensor.blas.Gemv], 'fast_compile') @register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(node, context_name): def local_gpua_gemv(op, context_name, inputs):
if node.op.inplace: if op.inplace:
return gpugemv_inplace return gpugemv_inplace
else: else:
return gpugemv_no_inplace return gpugemv_no_inplace
...@@ -922,8 +946,8 @@ def local_gpua_gemv(node, context_name): ...@@ -922,8 +946,8 @@ def local_gpua_gemv(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
@register_opt2([tensor.blas.Gemm], 'fast_compile') @register_opt2([tensor.blas.Gemm], 'fast_compile')
def local_gpua_gemm(node, context_name): def local_gpua_gemm(op, context_name, inputs):
if node.op.inplace: if op.inplace:
return gpugemm_inplace return gpugemm_inplace
else: else:
return gpugemm_no_inplace return gpugemm_no_inplace
...@@ -932,7 +956,7 @@ def local_gpua_gemm(node, context_name): ...@@ -932,7 +956,7 @@ def local_gpua_gemm(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot]) @op_lifter([tensor.blas.BatchedDot])
@register_opt2([tensor.blas.BatchedDot], 'fast_compile') @register_opt2([tensor.blas.BatchedDot], 'fast_compile')
def local_gpua_gemmbatch(node, context_name, inputs): def local_gpua_gemmbatch(op, context_name, inputs):
a, b = inputs a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2]) c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
...@@ -941,7 +965,7 @@ def local_gpua_gemmbatch(node, context_name, inputs): ...@@ -941,7 +965,7 @@ def local_gpua_gemmbatch(node, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Dot]) @op_lifter([tensor.basic.Dot])
@register_opt2([tensor.basic.Dot], 'fast_compile') @register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(node, context_name, inputs): def local_gpua_hgemm(op, context_name, inputs):
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
_logger.warning("Not performing dot of float16 on the GPU since " _logger.warning("Not performing dot of float16 on the GPU since "
...@@ -985,20 +1009,20 @@ def local_gpuagemmbatch_output_merge(node, *inputs): ...@@ -985,20 +1009,20 @@ def local_gpuagemmbatch_output_merge(node, *inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node, context_name): def local_gpua_ger(op, context_name, inputs):
return GpuGer(inplace=node.op.destructive) return GpuGer(inplace=op.destructive)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node, context_name): def local_gpua_dot22(op, context_name, inputs):
return gpu_dot22 return gpu_dot22
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22Scalar]) @op_lifter([tensor.blas.Dot22Scalar])
@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile') @register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
def local_gpua_dot22scalar(node, context_name, inputs): def local_gpua_dot22scalar(op, context_name, inputs):
x, y, a = inputs x, y, a = inputs
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name) y = as_gpuarray_variable(y, context_name)
...@@ -1009,28 +1033,28 @@ def local_gpua_dot22scalar(node, context_name, inputs): ...@@ -1009,28 +1033,28 @@ def local_gpua_dot22scalar(node, context_name, inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
@register_opt2([tensor.basic.Eye], 'fast_compile') @register_opt2([tensor.basic.Eye], 'fast_compile')
def local_gpua_eye(node, context_name): def local_gpua_eye(op, context_name, inputs):
return GpuEye(dtype=node.op.dtype, context_name=context_name) return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile') @register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name): def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs):
return gpu_crossentropy_softmax_argmax_1hot_with_bias return gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
@register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx],
               'fast_compile')
def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs):
    """Replace CrossentropySoftmax1HotWithBiasDx with its GPU op.

    The GPU op instance handles all cases, so it is returned directly.
    """
    return gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True)
@register_opt2([tensor.nnet.Softmax], 'fast_compile')
def local_gpua_softmax(op, context_name, inputs):
    """Replace ``tensor.nnet.Softmax`` with the GPU softmax op.

    The GPU op instance handles all cases, so it is returned directly.
    """
    return gpu_softmax
...@@ -1043,17 +1067,20 @@ def local_gpua_softmaxwithbias(node, context_name): ...@@ -1043,17 +1067,20 @@ def local_gpua_softmaxwithbias(node, context_name):
@register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert])
@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
def local_assert(op, context_name, inputs):
    """Move an ``Assert`` to the GPU when its value input can be lifted.

    Bug fix: the body still referenced ``node.inputs[0]`` even though the
    op_lifter signature was changed to ``(op, context_name, inputs)`` —
    ``node`` is no longer in scope and would raise a NameError when the
    optimization fires.  Use ``inputs[0]`` instead.
    """
    # Nothing to do if the asserted value already lives on the GPU.
    if isinstance(inputs[0].type, GpuArrayType):
        return
    # Lift only the value input; the condition inputs (inputs[1:]) stay
    # on the host, and the result is transferred back so callers see the
    # same host-side variable as before.
    return [host_from_gpu(op(as_gpuarray_variable(inputs[0], context_name),
                             *inputs[1:]))]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([ConvOp]) @op_lifter([ConvOp])
def local_error_convop(node, context_name): def local_error_convop(op, context_name, inputs):
assert False, """ assert False, """
ConvOp does not work with the gpuarray backend. ConvOp does not work with the gpuarray backend.
...@@ -1065,8 +1092,8 @@ theano.tensor.nnet.conv2d() ...@@ -1065,8 +1092,8 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile')
@op_lifter([SparseBlockGemv])
@register_opt2([SparseBlockGemv], 'fast_compile')
def local_lift_sparseblockgemv(op, context_name, inputs):
    """Swap ``SparseBlockGemv`` for its GPU implementation.

    Preserves the op's inplace flag by selecting the matching GPU
    instance.
    """
    if op.inplace:
        return gpu_sparse_block_gemv_inplace
    return gpu_sparse_block_gemv
...@@ -1075,8 +1102,8 @@ def local_lift_sparseblockgemv(node, context_name): ...@@ -1075,8 +1102,8 @@ def local_lift_sparseblockgemv(node, context_name):
@register_opt('fast_compile')
@op_lifter([SparseBlockOuter])
@register_opt2([SparseBlockOuter], 'fast_compile')
def local_lift_sparseblockouter(op, context_name, inputs):
    """Swap ``SparseBlockOuter`` for its GPU implementation.

    Preserves the op's inplace flag by selecting the matching GPU
    instance.
    """
    if op.inplace:
        return gpu_sparse_block_outer_inplace
    return gpu_sparse_block_outer
...@@ -1104,8 +1131,8 @@ def local_inplace_sparseblockouter(node): ...@@ -1104,8 +1131,8 @@ def local_inplace_sparseblockouter(node):
@register_opt2([AbstractConv2d, @register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile') AbstractConv2d_gradInputs], 'fast_compile')
def local_lift_abstractconv2d(node, context_name, inputs): def local_lift_abstractconv2d(op, context_name, inputs):
if isinstance(node.outputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU. # Don't handle this node here, it's already on the GPU.
return return
inps = list(inputs) inps = list(inputs)
...@@ -1113,7 +1140,7 @@ def local_lift_abstractconv2d(node, context_name, inputs): ...@@ -1113,7 +1140,7 @@ def local_lift_abstractconv2d(node, context_name, inputs):
context_name=context_name) context_name=context_name)
inps[1] = as_gpuarray_variable(inputs[1], inps[1] = as_gpuarray_variable(inputs[1],
context_name=context_name) context_name=context_name)
return [node.op(*inps)] return [op(*inps)]
# Register conv_groupopt here (rather than at its definition site) so that
# it runs after the abstract-conv lifting optimizations above.
register_opt('fast_compile')(conv_groupopt)
...@@ -1220,36 +1247,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -1220,36 +1247,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
#@register_opt2([scan_op.Scan], 'fast_compile') @register_opt2([scan_op.Scan], 'fast_compile')
def local_scan_to_gpua(node, context_name): def local_scan_to_gpua(op, context_name, inputs):
info = copy.deepcopy(node.op.info) info = copy.deepcopy(op.info)
if info.get('gpua', False): if info.get('gpua', False):
return return
info['gpua'] = True info['gpua'] = True
nw_ins = [node.inputs[0]] nw_ins = [inputs[0]]
e = (1 + e = (1 +
node.op.n_seqs + op.n_seqs +
node.op.n_mit_mot + op.n_mit_mot +
node.op.n_mit_sot + op.n_mit_sot +
node.op.n_sit_sot + op.n_sit_sot +
node.op.n_shared_outs) op.n_shared_outs)
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]] nw_ins += [safe_to_gpu(x, context_name) for x in inputs[1:e]]
b = e b = e
e = e + node.op.n_nit_sot e = e + op.n_nit_sot
nw_ins += node.inputs[b:e] nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]] nw_ins += [safe_to_gpu(x, context_name) for x in inputs[e:]]
scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs] scan_ins = [tensor_to_gpu(x, context_name) for x in op.inputs]
# The inner output corresponding to the looping condition should not be # The inner output corresponding to the looping condition should not be
# moved to the gpu # moved to the gpu
if node.op.info['as_while']: if op.info['as_while']:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]] scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs[:-1]]
scan_outs += [node.op.outputs[-1]] scan_outs += [op.outputs[-1]]
else: else:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs] scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs, scan_outs,
replace=list(zip(node.op.inputs, replace=list(zip(op.inputs,
(safe_to_cpu(x) for x in scan_ins)))) (safe_to_cpu(x) for x in scan_ins))))
# We need to construct the hash here, because scan # We need to construct the hash here, because scan
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论