Faster gpu opt: use isinstance instead of ==, as it is faster.

Also import a module in an opt only once.

Faster gpu opt: use isinstance instead of ==, as it is faster.
e532ac67 · Frederic · cefb3421 · e532ac67
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -18,7 +18,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import (
    device_properties, gpu_eye,
-    gpu_from_host, host_from_gpu, HostFromGpu,
+    gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
    GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten,
    GpuSubtensor, GpuAdvancedSubtensor1,
    GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
@@ -42,6 +42,7 @@ from theano.sandbox.cuda.elemwise import erfinv_gpu
 from theano.sandbox.cuda.var import CudaNdarrayConstant
 from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.blas import _is_real_vector, _is_real_matrix
+linalg = None
 #optdb.print_summary()  # shows what is currently registered
@@ -236,7 +237,7 @@ def local_gpu_elemwise_1(node):
    """
    gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...))
    """
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_i, = node.inputs
        if (host_i.owner and
            isinstance(host_i.owner.op, tensor.Elemwise) and
@@ -280,7 +281,7 @@ def local_gpu_dimshuffle_0(node):
            new_op = GpuDimShuffle(node.op.input_broadcastable,
                    node.op.new_order)
            return [host_from_gpu(new_op(gpu_from_host(input)))]
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           tensor.DimShuffle):
@@ -303,7 +304,7 @@ def local_gpu_specifyShape_0(node):
        if input.owner and isinstance(input.owner.op, HostFromGpu):
            return [host_from_gpu(tensor.specify_shape(gpu_from_host(input),
                                                      *node.inputs[1:]))]
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           tensor.SpecifyShape):
@@ -330,7 +331,7 @@ def local_gpu_dot_to_dot22(node):
    # In case the got do input upcast, we much check that we can
    # make it run on the gpu.
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        if node.outputs[0].type.dtype != 'float32':
            return False
        host_input = node.inputs[0]
@@ -355,7 +356,7 @@ def local_gpu_dot_to_dot22(node):
    if node.op == tensor.basic.dot:
        if node.outputs[0].type.dtype != 'float32':
            return False
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
+        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            x, y = node.inputs
            if _is_real_vector(x) and _is_real_matrix(y):
@@ -389,7 +390,7 @@ def local_gpu_lazy_ifelse(node):
        gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True)
        outs_clients = reduce(list.__add__,
                              [out.clients for out in node.outputs])
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
+        if numpy.any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                      for i in node.inputs]) or numpy.any(
                      [c != 'output' and c.op == gpu_from_host for c, idx
                       in outs_clients]):
@@ -406,7 +407,7 @@ def local_gpu_lazy_ifelse(node):
            return [host_from_gpu(out) for out in
                    gpu_ifelse.make_node(c, *outs).outputs]
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, theano.ifelse.IfElse) and
@@ -443,13 +444,14 @@ def local_gpu_dot22(node):
    dot(host_from_gpu) -> host_from_gpu(gpudot22)
    """
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
-        if host_input.owner and host_input.owner.op == tensor.blas._dot22:
+        if host_input.owner and isinstance(host_input.owner.op,
+                                           tensor.blas.Dot22):
            x, y = host_input.owner.inputs
            return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))]
-    if node.op == tensor.blas._dot22:
+    if isinstance(node.op, tensor.blas.Dot22):
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
+        if numpy.any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                      for i in node.inputs]):
            x, y = node.inputs
            return [host_from_gpu(gpu_dot22(gpu_from_host(x),
@@ -465,15 +467,16 @@ def local_gpu_dot22scalar(node):
    dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
    """
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
-            host_input.owner.op == tensor.blas._dot22scalar):
+            isinstance(host_input.owner.op,
+                       tensor.blas.Dot22Scalar)):
            x, y, scalar = host_input.owner.inputs
            return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y),
                                    tensor.blas._as_scalar(scalar))]
-    if node.op == tensor.blas._dot22scalar:
+    if isinstance(node.op, tensor.blas.Dot22Scalar):
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
+        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            x, y, scalar = node.inputs
            return [host_from_gpu(
@@ -491,14 +494,12 @@ def local_gpu_gemv(node):
    gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
    """
-    gemvs = [tensor.blas.gemv_inplace,
+    gemvs = (tensor.blas.Gemv,
-             tensor.blas.gemv_no_inplace,
+             tensor.blas_c.CGemv,
-             tensor.blas_c.cgemv_inplace,
+            )
-             tensor.blas_c.cgemv_no_inplace,
+    if isinstance(node.op, GpuFromHost):
-            ]
-    if node.op == gpu_from_host:
        host_input = node.inputs[0]
-        if host_input.owner and host_input.owner.op in gemvs:
+        if host_input.owner and isinstance(host_input.owner.op, gemvs):
            op = host_input.owner.op
            z, a, x, y, b = host_input.owner.inputs
            return [gpu_gemv_no_inplace(
@@ -507,11 +508,11 @@ def local_gpu_gemv(node):
                    gpu_from_host(x),
                    gpu_from_host(y),
                    b)]
-    if node.op in gemvs:
+    if isinstance(node.op, gemvs):
        z, a, x, y, b = node.inputs
-        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
+        x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
-        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
+        y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
-        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
+        z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(
                gpu_gemv_no_inplace(
@@ -532,17 +533,14 @@ def local_gpu_ger(node):
    ger(host_from_gpu) -> host_from_gpu(gpu_ger)
    """
-    gers = [tensor.blas_c.cger_inplace,
+    gers = (tensor.blas_c.CGer,
-            tensor.blas_c.cger_no_inplace,
+            tensor.blas.Ger,
-            tensor.blas.ger_destructive,
+            tensor.blas_scipy.ScipyGer,
-            tensor.blas.ger,
+        )
-            tensor.blas_scipy.scipy_ger_inplace,
-            tensor.blas_scipy.scipy_ger_no_inplace,
+    if isinstance(node.op, GpuFromHost):
-            ]
-    if node.op == gpu_from_host:
        host_input = node.inputs[0]
-        if host_input.owner and host_input.owner.op in gers:
+        if host_input.owner and isinstance(host_input.owner.op, gers):
            op = host_input.owner.op
            z, a, x, y = host_input.owner.inputs
            return [gpu_ger_no_inplace(
@@ -551,11 +549,11 @@ def local_gpu_ger(node):
                    gpu_from_host(x),
                    gpu_from_host(y)
                    )]
-    if node.op in gers:
+    if isinstance(node.op, gers):
        z, a, x, y = node.inputs
-        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
+        x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
-        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
+        y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
-        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
+        z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(
                gpu_ger_no_inplace(
@@ -575,13 +573,10 @@ def local_gpu_gemm(node):
    gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
    """
-    gemms = [
+    if isinstance(node.op, GpuFromHost):
-        tensor.blas.gemm_inplace,
-        tensor.blas.gemm_no_inplace,
-    ]
-    if node.op == gpu_from_host:
        host_input = node.inputs[0]
-        if host_input.owner and host_input.owner.op in gemms:
+        if host_input.owner and isinstance(host_input.owner.op,
+                                           tensor.blas.Gemm):
            op = host_input.owner.op
            z, a, x, y, b = host_input.owner.inputs
            return [gpu_gemm_no_inplace(gpu_from_host(z),
@@ -589,11 +584,11 @@ def local_gpu_gemm(node):
                              gpu_from_host(x),
                              gpu_from_host(y),
                              b)]
-    if node.op in gemms:
+    if isinstance(node.op, tensor.blas.Gemm):
        z, a, x, y, b = node.inputs
-        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
+        x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
-        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
+        y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
-        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
+        z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z),
                                                 a,
@@ -618,7 +613,7 @@ def local_gpu_careduce(node):
        # and max does not support all combinations of axes
        if node.op.scalar_op in [scal.add, scal.mul, scal.maximum, scal.minimum]:
            x, = node.inputs
-            if x.owner and x.owner.op == host_from_gpu:
+            if x.owner and isinstance(x.owner.op, HostFromGpu):
                if node.op.axis is None:
                    reduce_mask = [1] * x.type.ndim
                else:
@@ -688,7 +683,7 @@ def local_gpu_careduce(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.Reshape])
 def local_gpu_reshape(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           isinstance(host_input.owner.op, tensor.Reshape):
@@ -705,7 +700,7 @@ def local_gpu_reshape(node):
            return [gpu_reshape]
    if isinstance(node.op, tensor.Reshape):
        x, shp = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp)
            if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
@@ -722,7 +717,7 @@ def local_gpu_reshape(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.Flatten])
 def local_gpu_flatten(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           isinstance(host_input.owner.op, tensor.Flatten):
@@ -732,7 +727,7 @@ def local_gpu_flatten(node):
    if isinstance(node.op, tensor.Flatten):
        x, = node.inputs
        outdim = node.op.outdim
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            return [host_from_gpu(GpuFlatten(outdim)(gpu_x))]
    return False
@@ -741,7 +736,7 @@ def local_gpu_flatten(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.Subtensor])
 def local_gpu_subtensor(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           isinstance(host_input.owner.op, tensor.Subtensor):
@@ -751,9 +746,11 @@ def local_gpu_subtensor(node):
            return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)]
    if isinstance(node.op, tensor.Subtensor):
        x = node.inputs[0]
-        coords = node.inputs[1:]
+        if (x.owner and
-        if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32":
+            isinstance(x.owner.op, HostFromGpu) and
+            x.dtype == "float32"):
            gpu_x, = x.owner.inputs
+            coords = node.inputs[1:]
            return [host_from_gpu(GpuSubtensor(
                node.op.idx_list)(gpu_x, *coords))]
    return False
@@ -762,7 +759,7 @@ def local_gpu_subtensor(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.AdvancedSubtensor1])
 def local_gpu_advanced_subtensor1(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
@@ -772,7 +769,7 @@ def local_gpu_advanced_subtensor1(node):
    if node.op.__class__ is tensor.AdvancedSubtensor1:
        x = node.inputs[0]
        coords = node.inputs[1:]
-        if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32":
+        if x.owner and isinstance(x.owner.op, HostFromGpu) and x.dtype == "float32":
            gpu_x, = x.owner.inputs
            return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
    return False
@@ -781,7 +778,7 @@ def local_gpu_advanced_subtensor1(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.AdvancedIncSubtensor1])
 def local_gpu_advanced_incsubtensor1(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        # Should not execute for GpuAdvancedIncSubtensor1
        if host_input.owner and \
@@ -816,12 +813,12 @@ def local_gpu_advanced_incsubtensor1(node):
        x, y = node.inputs[0:2]
        coords = node.inputs[2:]
        go_gpu = False
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            go_gpu = True
            gpu_x, = x.owner.inputs
        else:
            gpu_x = gpu_from_host(x)
-        if y.owner and y.owner.op == host_from_gpu:
+        if y.owner and isinstance(y.owner.op, HostFromGpu):
            go_gpu = True
            gpu_y, = y.owner.inputs
        else:
@@ -855,7 +852,7 @@ def local_gpu_advanced_incsubtensor1(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.IncSubtensor])
 def local_gpu_incsubtensor(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_output = node.inputs[0]
        if host_output.owner and \
           type(host_output.owner.op) == tensor.IncSubtensor:
@@ -879,12 +876,12 @@ def local_gpu_incsubtensor(node):
        assert isinstance(y.type, tensor.TensorType)
        coords = node.inputs[2:]
        go_gpu = False
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            go_gpu = True
            gpu_x, = x.owner.inputs
        else:
            gpu_x = gpu_from_host(x)
-        if y.owner and y.owner.op == host_from_gpu:
+        if y.owner and isinstance(y.owner.op, HostFromGpu):
            go_gpu = True
            gpu_y, = y.owner.inputs
        else:
@@ -904,7 +901,7 @@ def local_gpu_incsubtensor(node):
 def local_gpu_shape(node):
    if isinstance(node.op, tensor.Shape):
        x, = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            return [gpu_shape(gpu_x)]
    return False
@@ -916,7 +913,7 @@ def local_gpu_rebroadcast(node):
    '''rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))'''
    if isinstance(node.op, tensor.Rebroadcast):
        x, = node.inputs
-        if (x.owner and x.owner.op == host_from_gpu):
+        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            gpu_x = x.owner.inputs[0]
            return [host_from_gpu(node.op(gpu_x))]
@@ -930,7 +927,7 @@ def gpu_print_wrapper(op, cnda):
 def local_gpu_print_op(node):
    if isinstance(node.op, tensor.printing.Print):
        x, = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            new_op = node.op.__class__(global_fn=gpu_print_wrapper)
            new_op.old_op = node.op
@@ -951,7 +948,7 @@ import theano.tensor.nnet
 def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
    if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias):
        x, b, y = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            # if y is a cast to integers, we can go to the underlying
            # thing if we want, since this gpu op will cast to integers
@@ -981,7 +978,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
 def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
    if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
        dnll, sm, yidx = node.inputs
-        if sm.owner and sm.owner.op == host_from_gpu:
+        if sm.owner and isinstance(sm.owner.op, HostFromGpu):
            gpu_sm, = sm.owner.inputs
            gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()(
                gpu_from_host(dnll),
@@ -996,7 +993,7 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
 def local_gpu_softmax(node):
    if isinstance(node.op, tensor.nnet.Softmax):
        x, = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            gpu_sm = GpuSoftmax()(gpu_x)
            return [host_from_gpu(gpu_sm)]
@@ -1008,8 +1005,8 @@ def local_gpu_softmax(node):
 def local_gpu_softmax_with_bias(node):
    if isinstance(node.op, tensor.nnet.SoftmaxWithBias):
        x, b = node.inputs
-        x_on_gpu = x.owner and x.owner.op == host_from_gpu
+        x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
-        b_on_gpu = b.owner and b.owner.op == host_from_gpu
+        b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
        if x_on_gpu or b_on_gpu:
            gpu_sm = GpuSoftmaxWithBias()(gpu_from_host(x), gpu_from_host(b))
            return [host_from_gpu(gpu_sm)]
@@ -1081,7 +1078,7 @@ def local_gpu_conv(node):
            atol = 3e-5
        return CudaNdarrayType.values_eq_approx(a, b, atol=atol)
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        #gpu_from_host(conv) -> gpu_conv(gpu_from_host)
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, conv.ConvOp):
@@ -1101,8 +1098,8 @@ def local_gpu_conv(node):
    if isinstance(node.op, conv.ConvOp):
        #conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        img, kern = node.inputs
-        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
+        img_on_gpu = (img.owner and isinstance(img.owner.op, HostFromGpu))
-        kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
+        kern_on_gpu = (kern.owner and isinstance(kern.owner.op, HostFromGpu))
        if img_on_gpu or kern_on_gpu:
            gpu_conv = GpuConvOp_from_ConvOp(node.op)
            if gpu_conv is None:
@@ -1125,7 +1122,7 @@ import theano.tensor.signal.downsample as downsample
 def local_gpu_downsample_factor_max(node):
    if isinstance(node.op, downsample.DownsampleFactorMax):
        x, = node.inputs
-        if (x.owner and x.owner.op == host_from_gpu):
+        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            gpu_ds = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border)
            return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
@@ -1135,7 +1132,7 @@ def local_gpu_downsample_factor_max(node):
 def local_gpu_downsample_factor_max_grad(node):
    if isinstance(node.op, downsample.DownsampleFactorMaxGrad):
        x, z, gz = node.inputs
-        if (x.owner and x.owner.op == host_from_gpu):
+        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            gpu_ds_grad = GpuDownsampleFactorMaxGrad(node.op.ds,
                                                     node.op.ignore_border)
            return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
@@ -1187,7 +1184,7 @@ def local_gpu_join(node):
        #print "OPT: axis_and_tensors=", axis_and_tensors
-        matches = [(not t.owner is None and t.owner.op == host_from_gpu) or
+        matches = [(not t.owner is None and isinstance(t.owner.op, HostFromGpu)) or
                   isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
        #print "OPT: matches =", matches
@@ -1366,7 +1363,7 @@ def local_gpualloc(node):
    replace = False
    if node.op == tensor.alloc:
        if node.inputs[0].owner and \
-           node.inputs[0].owner.op == host_from_gpu:
+           isinstance(node.inputs[0].owner.op, HostFromGpu):
            replace = True
        elif all([c != 'output' and c.op == gpu_from_host
                for c, idx in node.outputs[0].clients]):
@@ -1427,14 +1424,14 @@ def local_gpu_eye(node):
    eye(host_from_gpu) -> host_from_gpu(gpueye)
    """
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, tensor.Eye) and
            host_input.owner.op.dtype == "float32"):
            return [gpu_eye(*host_input.owner.inputs)]
    if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
+        if numpy.any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                      for i in node.inputs]):
            return [host_from_gpu(gpu_eye(*node.inputs))]
    return False
@@ -1510,14 +1507,18 @@ def local_gpu_extract_diagonal(node):
    extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal)
    gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host)
    """
+    global linalg
+    if linalg is None:
        from theano.sandbox import linalg
+        linalg = theano.sandbox.linalg
    if (isinstance(node.op, linalg.ops.ExtractDiag) and
        isinstance(node.inputs[0].type,
                   theano.tensor.TensorType)):
        inp = node.inputs[0]
        if inp.owner and isinstance(inp.owner.op, HostFromGpu):
            return [host_from_gpu(linalg.extract_diag(gpu_from_host(inp)))]
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, linalg.ops.ExtractDiag) and
@@ -1538,7 +1539,7 @@ def gpuScanOptimization(node):
    """
    #gpu_from_host(scan) -> GPUscan(gpu_from_host)
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, scan_op.Scan) and
@@ -1599,7 +1600,7 @@ def gpuScanOptimization(node):
    #scan(host_from_gpu) -> host_from_gpu(GPUscan)
    if (type(node.op) == scan_op.Scan
        and not node.op.info['gpu']):
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
+        if numpy.any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                      for i in node.inputs]):
            thescan = node.op