Merge pull request #1779 from nouiz/faster_opt

Faster opt

Merge pull request #1779 from nouiz/faster_opt
e93c61d1 · abergeron · 8cc9395f · 08d61b24 · e93c61d1 · e93c61d1
--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -1047,10 +1047,6 @@ class PatternSub(LocalOptimizer):
            self.__name__ = name
        self.pdb = pdb

-    def skip_identities(self, expr):
-        if self.skip_identities_fn:
-            return self.skip_identities_fn(expr)
-
    def op_key(self):
        return self.op

@@ -1064,10 +1060,13 @@ class PatternSub(LocalOptimizer):
        """
        if node.op != self.op:
            return False
-
+        #TODO: if we remove pdb, does this speed things up?
        def match(pattern, expr, u, allow_multiple_clients=False, pdb=False):
+            #TODO move outside match
            def retry_with_equiv():
-                expr_equiv = self.skip_identities(expr)
+                if not self.skip_identities_fn:
+                    return False
+                expr_equiv = self.skip_identities_fn(expr)
                if expr_equiv is None:
                    return False
                #TODO: Not sure how to handle multiple_clients flag
@@ -1126,19 +1125,19 @@ class PatternSub(LocalOptimizer):
                pdb.set_trace()
            return u

-        def build(pattern, u):
-            if isinstance(pattern, (list, tuple)):
-                args = [build(p, u) for p in pattern[1:]]
-                return pattern[0](*args)
-            elif isinstance(pattern, basestring):
-                return u[unify.Var(pattern)]
-            elif isinstance(pattern, (int, float)):
-                return pattern
-            else:
-                return pattern.clone()
        u = match(self.in_pattern, node.out, unify.Unification(), True,
                  self.pdb)
        if u:
+            def build(pattern, u):
+                if isinstance(pattern, (list, tuple)):
+                    args = [build(p, u) for p in pattern[1:]]
+                    return pattern[0](*args)
+                elif isinstance(pattern, basestring):
+                    return u[unify.Var(pattern)]
+                elif isinstance(pattern, (int, float)):
+                    return pattern
+                else:
+                    return pattern.clone()
            p = self.out_pattern
            new = build(p, u)
            ####print "PatternSub matched:", new
@@ -1520,19 +1519,23 @@ class EquilibriumOptimizer(NavigatorOptimizer):
    def __init__(self,
                 optimizers,
                 failure_callback=None,
+                 ignore_newtrees=True,
                 max_use_ratio=None):
-        """
+        """ Apply optimizations until equilibrium point.
+
        :param optimizers:  list or set of local or global optimizations to
            apply until equilibrium.

        :param max_use_ratio: each optimizer can be applied at most
            (size of graph * this number) times
+        :param ignore_newtrees: See EquilibriumDB ignore_newtrees
+            parameter definition

        """

        super(EquilibriumOptimizer, self).__init__(
            None,
-            ignore_newtrees=True,
+            ignore_newtrees=ignore_newtrees,
            failure_callback=failure_callback)
        self.local_optimizers_map = dict()
        self.local_optimizers_all = []

--- a/theano/gof/optdb.py
+++ b/theano/gof/optdb.py
@@ -179,23 +179,33 @@ class Query(object):


 class EquilibriumDB(DB):
-    """ A set of potential optimizations which should be applied in an
+    """A set of potential optimizations which should be applied in an
        arbitrary order until equilibrium is reached.

    Canonicalize, Stabilize, and Specialize are all equilibrium optimizations.

+    :param ignore_newtrees: If False, we will apply local opts on new
+        nodes introduced during local optimization application. This
+        could result in fewer fgraph iterations, but this doesn't mean
+        it will be faster globally.
+
    .. note::

        We can put LocalOptimizer and Optimizer as EquilibriumOptimizer
        suppor both.

    """
+    def __init__(self, ignore_newtrees=True):
+        super(EquilibriumDB, self).__init__()
+        self.ignore_newtrees = ignore_newtrees

    def query(self, *tags, **kwtags):
        opts = super(EquilibriumDB, self).query(*tags, **kwtags)
-        return opt.EquilibriumOptimizer(opts,
-                max_use_ratio=config.optdb.max_use_ratio,
-                failure_callback=opt.NavigatorOptimizer.warn_inplace)
+        return opt.EquilibriumOptimizer(
+            opts,
+            max_use_ratio=config.optdb.max_use_ratio,
+            ignore_newtrees=self.ignore_newtrees,
+            failure_callback=opt.NavigatorOptimizer.warn_inplace)


 class SequenceDB(DB):

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -18,7 +18,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import (
    device_properties, gpu_eye,
-    gpu_from_host, host_from_gpu, HostFromGpu,
+    gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
    GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten,
    GpuSubtensor, GpuAdvancedSubtensor1,
    GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
@@ -42,10 +42,14 @@ from theano.sandbox.cuda.elemwise import erfinv_gpu
 from theano.sandbox.cuda.var import CudaNdarrayConstant
 from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.blas import _is_real_vector, _is_real_matrix
+linalg = None

 #optdb.print_summary()  # shows what is currently registered

-gpu_optimizer = EquilibriumDB()
+#ignore_newtrees=False speeds up the optimization, as this is the pattern
+#we use for optimization. Otherwise, we can iterate 100s of times on
+#the graph and apply only a few optimizations each time.
+gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
 gpu_cut_copies = EquilibriumDB()
 gpu_seqopt = SequenceDB()
 gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
@@ -65,6 +69,9 @@ optdb.register('gpu_after_fusion',
               optdb.__position__.get('elemwise_fusion', 49) + .1,
               'gpu')

+## Register merge_optimizer as a global opt
+gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer, 'fast_run')
+

 def register_opt(*tags, **kwargs):
    def f(local_opt):
@@ -76,6 +83,8 @@ def register_opt(*tags, **kwargs):
 #register local_track_shape_i at this level too
 #to make multi-level lift of shape work.
 register_opt()(theano.tensor.opt.local_track_shape_i)
+register_opt(name='gpu_constant_folding')(
+    tensor.opt.constant_folding)


 class InputToGpuOptimizer(Optimizer):
@@ -128,7 +137,7 @@ def local_cut_gpu_host_gpu(node):
        return [node.inputs[0].owner.inputs[0]]
    return False
 gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu,
-        'fast_run', 'inplace', 'gpu')
+        'fast_run', 'gpu')
 gpu_cut_copies.register('cut_gpu_constant_transfers',
                        tensor.opt.constant_folding,
                        'fast_run', 'gpu')
@@ -176,10 +185,10 @@ def local_gpu_elemwise_0(node):
    """
    if (isinstance(node.op, tensor.Elemwise) and
        dtype_in_elemwise_supported(node.op)):
-        if numpy.any([i.owner and
-                      isinstance(i.owner.op, HostFromGpu)
-                      for i in node.inputs]):
-            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
+        if any([i.owner and
+                isinstance(i.owner.op, HostFromGpu)
+                for i in node.inputs]):
+            if all([o.type.dtype == 'float32' for o in node.outputs]):
                # Don't set any inplace pattern.
                # gpu_inplace_elemwise_optimizer will do it later

@@ -196,14 +205,14 @@ def local_gpu_elemwise_0(node):
                upcastable = set(['float32', 'int8', 'int16', 'uint8',
                                  'uint16'])
                # case 1 - all inputs are already float32
-                if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
+                if all([i.type.dtype == 'float32' for i in node.inputs]):
                    #TODO: change this when fusion makes Elemwise with multiple
                    # outputs
                    gpu_elemwise = new_op(*(gpu_from_host(i)
                                            for i in node.inputs))
                # case 2 - it is still ok if some inputs were upcast to float32
-                elif numpy.all([i.type.dtype in upcastable
-                                for i in node.inputs]):
+                elif all([i.type.dtype in upcastable
+                          for i in node.inputs]):
                    # second - establish that a new node with upcasted inputs
                    # has the same outputs types as the original node
                    upcasted = node.op.make_node(*[tensor.cast(i, 'float32')
@@ -233,7 +242,7 @@ def local_gpu_elemwise_1(node):
    """
    gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...))
    """
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_i, = node.inputs
        if (host_i.owner and
            isinstance(host_i.owner.op, tensor.Elemwise) and
@@ -277,7 +286,7 @@ def local_gpu_dimshuffle_0(node):
            new_op = GpuDimShuffle(node.op.input_broadcastable,
                    node.op.new_order)
            return [host_from_gpu(new_op(gpu_from_host(input)))]
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           tensor.DimShuffle):
@@ -300,7 +309,7 @@ def local_gpu_specifyShape_0(node):
        if input.owner and isinstance(input.owner.op, HostFromGpu):
            return [host_from_gpu(tensor.specify_shape(gpu_from_host(input),
                                                      *node.inputs[1:]))]
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           tensor.SpecifyShape):
@@ -327,7 +336,7 @@ def local_gpu_dot_to_dot22(node):

    # In case the got do input upcast, we much check that we can
    # make it run on the gpu.
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        if node.outputs[0].type.dtype != 'float32':
            return False
        host_input = node.inputs[0]
@@ -352,8 +361,8 @@ def local_gpu_dot_to_dot22(node):
    if node.op == tensor.basic.dot:
        if node.outputs[0].type.dtype != 'float32':
            return False
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
-                      for i in node.inputs]):
+        if any([i.owner and isinstance(i.owner.op, HostFromGpu)
+                for i in node.inputs]):
            x, y = node.inputs
            if _is_real_vector(x) and _is_real_matrix(y):
                new_op = GpuDimShuffle((False,), ['x', 0])
@@ -386,10 +395,10 @@ def local_gpu_lazy_ifelse(node):
        gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True)
        outs_clients = reduce(list.__add__,
                              [out.clients for out in node.outputs])
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
-                      for i in node.inputs]) or numpy.any(
-                      [c != 'output' and c.op == gpu_from_host for c, idx
-                       in outs_clients]):
+        if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
+                for i in node.inputs]) or any(
+                    [c != 'output' and c.op == gpu_from_host for c, idx
+                     in outs_clients]):

            c = node.inputs[0]
            outs = node.inputs[1:]
@@ -403,7 +412,7 @@ def local_gpu_lazy_ifelse(node):
            return [host_from_gpu(out) for out in
                    gpu_ifelse.make_node(c, *outs).outputs]

-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, theano.ifelse.IfElse) and
@@ -440,14 +449,15 @@ def local_gpu_dot22(node):

    dot(host_from_gpu) -> host_from_gpu(gpudot22)
    """
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
-        if host_input.owner and host_input.owner.op == tensor.blas._dot22:
+        if host_input.owner and isinstance(host_input.owner.op,
+                                           tensor.blas.Dot22):
            x, y = host_input.owner.inputs
            return [gpu_dot22(gpu_from_host(x), gpu_from_host(y))]
-    if node.op == tensor.blas._dot22:
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
-                      for i in node.inputs]):
+    if isinstance(node.op, tensor.blas.Dot22):
+        if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
+                for i in node.inputs]):
            x, y = node.inputs
            return [host_from_gpu(gpu_dot22(gpu_from_host(x),
                                            gpu_from_host(y)))]
@@ -462,16 +472,17 @@ def local_gpu_dot22scalar(node):

    dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
    """
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
-            host_input.owner.op == tensor.blas._dot22scalar):
+            isinstance(host_input.owner.op,
+                       tensor.blas.Dot22Scalar)):
            x, y, scalar = host_input.owner.inputs
            return [gpu_dot22scalar(gpu_from_host(x), gpu_from_host(y),
                                    tensor.blas._as_scalar(scalar))]
-    if node.op == tensor.blas._dot22scalar:
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
-                      for i in node.inputs]):
+    if isinstance(node.op, tensor.blas.Dot22Scalar):
+        if any([i.owner and isinstance(i.owner.op, HostFromGpu)
+                for i in node.inputs]):
            x, y, scalar = node.inputs
            return [host_from_gpu(
                gpu_dot22scalar(gpu_from_host(x),
@@ -488,31 +499,28 @@ def local_gpu_gemv(node):
    gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)

    """
-    gemvs = {
-            tensor.blas.gemv_inplace: gpu_gemv_no_inplace,
-            tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace,
-            tensor.blas_c.CGemv(inplace=True): gpu_gemv_no_inplace,
-            tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace,
-            }
-    if node.op == gpu_from_host:
+    gemvs = (tensor.blas.Gemv,
+             tensor.blas_c.CGemv,
+            )
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
-        if host_input.owner and host_input.owner.op in gemvs:
+        if host_input.owner and isinstance(host_input.owner.op, gemvs):
            op = host_input.owner.op
            z, a, x, y, b = host_input.owner.inputs
-            return [gemvs[op](
+            return [gpu_gemv_no_inplace(
                    gpu_from_host(z),
                    a,
                    gpu_from_host(x),
                    gpu_from_host(y),
                    b)]
-    if node.op in gemvs:
+    if isinstance(node.op, gemvs):
        z, a, x, y, b = node.inputs
-        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
-        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
-        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
+        x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
+        y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
+        z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(
-                gemvs[node.op](
+                gpu_gemv_no_inplace(
                    gpu_from_host(z),
                    a,
                    gpu_from_host(x),
@@ -530,33 +538,30 @@ def local_gpu_ger(node):
    ger(host_from_gpu) -> host_from_gpu(gpu_ger)

    """
-    gers = {
-            tensor.blas_c.CGer(destructive=True): gpu_ger_no_inplace,
-            tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace,
-            tensor.blas.Ger(destructive=True): gpu_ger_no_inplace,
-            tensor.blas.Ger(destructive=False): gpu_ger_no_inplace,
-            tensor.blas_scipy.ScipyGer(destructive=True): gpu_ger_no_inplace,
-            tensor.blas_scipy.ScipyGer(destructive=False): gpu_ger_no_inplace,
-            }
-    if node.op == gpu_from_host:
+    gers = (tensor.blas_c.CGer,
+            tensor.blas.Ger,
+            tensor.blas_scipy.ScipyGer,
+        )
+
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
-        if host_input.owner and host_input.owner.op in gers:
+        if host_input.owner and isinstance(host_input.owner.op, gers):
            op = host_input.owner.op
            z, a, x, y = host_input.owner.inputs
-            return [gers[op](
+            return [gpu_ger_no_inplace(
                    gpu_from_host(z),
                    a,
                    gpu_from_host(x),
                    gpu_from_host(y)
                    )]
-    if node.op in gers:
+    if isinstance(node.op, gers):
        z, a, x, y = node.inputs
-        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
-        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
-        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
+        x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
+        y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
+        z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
        if x_on_gpu or y_on_gpu or z_on_gpu:
            return [host_from_gpu(
-                gers[node.op](
+                gpu_ger_no_inplace(
                    gpu_from_host(z),
                    a,
                    gpu_from_host(x),
@@ -573,26 +578,24 @@ def local_gpu_gemm(node):

    gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
    """
-    gemms = {
-            #tensor.blas.gemm_inplace: gpu_gemm_inplace,
-            tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace}
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
-        if host_input.owner and host_input.owner.op in gemms:
+        if host_input.owner and isinstance(host_input.owner.op,
+                                           tensor.blas.Gemm):
            op = host_input.owner.op
            z, a, x, y, b = host_input.owner.inputs
-            return [gemms[op](gpu_from_host(z),
-                              a,
-                              gpu_from_host(x),
-                              gpu_from_host(y),
-                              b)]
-    if node.op in gemms:
+            return [gpu_gemm_no_inplace(gpu_from_host(z),
+                                        a,
+                                        gpu_from_host(x),
+                                        gpu_from_host(y),
+                                        b)]
+    if isinstance(node.op, tensor.blas.Gemm):
        z, a, x, y, b = node.inputs
-        x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
-        y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
-        z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
+        x_on_gpu = (x.owner and isinstance(x.owner.op, HostFromGpu))
+        y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
+        z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
        if x_on_gpu or y_on_gpu or z_on_gpu:
-            return [host_from_gpu(gemms[node.op](gpu_from_host(z),
+            return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z),
                                                 a,
                                                 gpu_from_host(x),
                                                 gpu_from_host(y),
@@ -613,9 +616,10 @@ def local_gpu_careduce(node):
        scalar_op = node.op.scalar_op
        # currently, only these two ops are supported at all,
        # and max does not support all combinations of axes
-        if node.op.scalar_op in [scal.add, scal.mul, scal.maximum, scal.minimum]:
+        if isinstance(node.op.scalar_op, (scal.Add, scal.Mul,
+                                          scal.Maximum, scal.Minimum)):
            x, = node.inputs
-            if x.owner and x.owner.op == host_from_gpu:
+            if x.owner and isinstance(x.owner.op, HostFromGpu):
                if node.op.axis is None:
                    reduce_mask = [1] * x.type.ndim
                else:
@@ -685,7 +689,7 @@ def local_gpu_careduce(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.Reshape])
 def local_gpu_reshape(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           isinstance(host_input.owner.op, tensor.Reshape):
@@ -702,7 +706,7 @@ def local_gpu_reshape(node):
            return [gpu_reshape]
    if isinstance(node.op, tensor.Reshape):
        x, shp = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp)
            if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
@@ -719,7 +723,7 @@ def local_gpu_reshape(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.Flatten])
 def local_gpu_flatten(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           isinstance(host_input.owner.op, tensor.Flatten):
@@ -729,7 +733,7 @@ def local_gpu_flatten(node):
    if isinstance(node.op, tensor.Flatten):
        x, = node.inputs
        outdim = node.op.outdim
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            return [host_from_gpu(GpuFlatten(outdim)(gpu_x))]
    return False
@@ -738,7 +742,7 @@ def local_gpu_flatten(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.Subtensor])
 def local_gpu_subtensor(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           isinstance(host_input.owner.op, tensor.Subtensor):
@@ -748,9 +752,11 @@ def local_gpu_subtensor(node):
            return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)]
    if isinstance(node.op, tensor.Subtensor):
        x = node.inputs[0]
-        coords = node.inputs[1:]
-        if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32":
+        if (x.owner and
+            isinstance(x.owner.op, HostFromGpu) and
+            x.dtype == "float32"):
            gpu_x, = x.owner.inputs
+            coords = node.inputs[1:]
            return [host_from_gpu(GpuSubtensor(
                node.op.idx_list)(gpu_x, *coords))]
    return False
@@ -759,7 +765,7 @@ def local_gpu_subtensor(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.AdvancedSubtensor1])
 def local_gpu_advanced_subtensor1(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
@@ -769,7 +775,7 @@ def local_gpu_advanced_subtensor1(node):
    if node.op.__class__ is tensor.AdvancedSubtensor1:
        x = node.inputs[0]
        coords = node.inputs[1:]
-        if x.owner and x.owner.op == host_from_gpu and x.dtype == "float32":
+        if x.owner and isinstance(x.owner.op, HostFromGpu) and x.dtype == "float32":
            gpu_x, = x.owner.inputs
            return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
    return False
@@ -778,7 +784,7 @@ def local_gpu_advanced_subtensor1(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.AdvancedIncSubtensor1])
 def local_gpu_advanced_incsubtensor1(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        # Should not execute for GpuAdvancedIncSubtensor1
        if host_input.owner and \
@@ -813,12 +819,12 @@ def local_gpu_advanced_incsubtensor1(node):
        x, y = node.inputs[0:2]
        coords = node.inputs[2:]
        go_gpu = False
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            go_gpu = True
            gpu_x, = x.owner.inputs
        else:
            gpu_x = gpu_from_host(x)
-        if y.owner and y.owner.op == host_from_gpu:
+        if y.owner and isinstance(y.owner.op, HostFromGpu):
            go_gpu = True
            gpu_y, = y.owner.inputs
        else:
@@ -852,7 +858,7 @@ def local_gpu_advanced_incsubtensor1(node):
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.IncSubtensor])
 def local_gpu_incsubtensor(node):
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_output = node.inputs[0]
        if host_output.owner and \
           type(host_output.owner.op) == tensor.IncSubtensor:
@@ -876,12 +882,12 @@ def local_gpu_incsubtensor(node):
        assert isinstance(y.type, tensor.TensorType)
        coords = node.inputs[2:]
        go_gpu = False
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            go_gpu = True
            gpu_x, = x.owner.inputs
        else:
            gpu_x = gpu_from_host(x)
-        if y.owner and y.owner.op == host_from_gpu:
+        if y.owner and isinstance(y.owner.op, HostFromGpu):
            go_gpu = True
            gpu_y, = y.owner.inputs
        else:
@@ -901,7 +907,7 @@ def local_gpu_incsubtensor(node):
 def local_gpu_shape(node):
    if isinstance(node.op, tensor.Shape):
        x, = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            return [gpu_shape(gpu_x)]
    return False
@@ -913,7 +919,7 @@ def local_gpu_rebroadcast(node):
    '''rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))'''
    if isinstance(node.op, tensor.Rebroadcast):
        x, = node.inputs
-        if (x.owner and x.owner.op == host_from_gpu):
+        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            gpu_x = x.owner.inputs[0]
            return [host_from_gpu(node.op(gpu_x))]

@@ -927,7 +933,7 @@ def gpu_print_wrapper(op, cnda):
 def local_gpu_print_op(node):
    if isinstance(node.op, tensor.printing.Print):
        x, = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            new_op = node.op.__class__(global_fn=gpu_print_wrapper)
            new_op.old_op = node.op
@@ -948,7 +954,7 @@ import theano.tensor.nnet
 def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
    if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias):
        x, b, y = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            # if y is a cast to integers, we can go to the underlying
            # thing if we want, since this gpu op will cast to integers
@@ -978,7 +984,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
 def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
    if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
        dnll, sm, yidx = node.inputs
-        if sm.owner and sm.owner.op == host_from_gpu:
+        if sm.owner and isinstance(sm.owner.op, HostFromGpu):
            gpu_sm, = sm.owner.inputs
            gpu_dx = GpuCrossentropySoftmax1HotWithBiasDx()(
                gpu_from_host(dnll),
@@ -993,7 +999,7 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
 def local_gpu_softmax(node):
    if isinstance(node.op, tensor.nnet.Softmax):
        x, = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
+        if x.owner and isinstance(x.owner.op, HostFromGpu):
            gpu_x, = x.owner.inputs
            gpu_sm = GpuSoftmax()(gpu_x)
            return [host_from_gpu(gpu_sm)]
@@ -1005,8 +1011,8 @@ def local_gpu_softmax(node):
 def local_gpu_softmax_with_bias(node):
    if isinstance(node.op, tensor.nnet.SoftmaxWithBias):
        x, b = node.inputs
-        x_on_gpu = x.owner and x.owner.op == host_from_gpu
-        b_on_gpu = b.owner and b.owner.op == host_from_gpu
+        x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
+        b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
        if x_on_gpu or b_on_gpu:
            gpu_sm = GpuSoftmaxWithBias()(gpu_from_host(x), gpu_from_host(b))
            return [host_from_gpu(gpu_sm)]
@@ -1078,7 +1084,7 @@ def local_gpu_conv(node):
            atol = 3e-5
        return CudaNdarrayType.values_eq_approx(a, b, atol=atol)

-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        #gpu_from_host(conv) -> gpu_conv(gpu_from_host)
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, conv.ConvOp):
@@ -1098,8 +1104,8 @@ def local_gpu_conv(node):
    if isinstance(node.op, conv.ConvOp):
        #conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        img, kern = node.inputs
-        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
-        kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
+        img_on_gpu = (img.owner and isinstance(img.owner.op, HostFromGpu))
+        kern_on_gpu = (kern.owner and isinstance(kern.owner.op, HostFromGpu))
        if img_on_gpu or kern_on_gpu:
            gpu_conv = GpuConvOp_from_ConvOp(node.op)
            if gpu_conv is None:
@@ -1122,7 +1128,7 @@ import theano.tensor.signal.downsample as downsample
 def local_gpu_downsample_factor_max(node):
    if isinstance(node.op, downsample.DownsampleFactorMax):
        x, = node.inputs
-        if (x.owner and x.owner.op == host_from_gpu):
+        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            gpu_ds = GpuDownsampleFactorMax(node.op.ds, node.op.ignore_border)
            return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]

@@ -1132,7 +1138,7 @@ def local_gpu_downsample_factor_max(node):
 def local_gpu_downsample_factor_max_grad(node):
    if isinstance(node.op, downsample.DownsampleFactorMaxGrad):
        x, z, gz = node.inputs
-        if (x.owner and x.owner.op == host_from_gpu):
+        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            gpu_ds_grad = GpuDownsampleFactorMaxGrad(node.op.ds,
                                                     node.op.ignore_border)
            return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
@@ -1184,12 +1190,12 @@ def local_gpu_join(node):

        #print "OPT: axis_and_tensors=", axis_and_tensors

-        matches = [(not t.owner is None and t.owner.op == host_from_gpu) or
+        matches = [(not t.owner is None and isinstance(t.owner.op, HostFromGpu)) or
                   isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
        #print "OPT: matches =", matches

        # if all input tensors are host_from_gpu'ified
-        if numpy.all(matches):
+        if all(matches):
            # the extra gpu_from_host introduced here will
            # be removed by further optimizations
            new_tensors = [gpu_from_host(t) for t in axis_and_tensors[1:]]
@@ -1363,18 +1369,18 @@ def local_gpualloc(node):
    replace = False
    if node.op == tensor.alloc:
        if node.inputs[0].owner and \
-           node.inputs[0].owner.op == host_from_gpu:
+           isinstance(node.inputs[0].owner.op, HostFromGpu):
            replace = True
        elif all([c != 'output' and c.op == gpu_from_host
-                for c, idx in node.outputs[0].clients]):
+                  for c, idx in node.outputs[0].clients]):
            # if all clients are on gpu
            replace = True
        elif all([c != 'output' and
-                c.op == tensor.join and
-                all([i.owner and
-                     i.owner.op in [host_from_gpu, tensor.alloc]
-                     for i in c.inputs[1:]])
-                for c, idx in node.outputs[0].clients]):
+                  c.op == tensor.join and
+                  all([i.owner and
+                       i.owner.op in [host_from_gpu, tensor.alloc]
+                       for i in c.inputs[1:]])
+                  for c, idx in node.outputs[0].clients]):
            # if the client is a subtensor with input on gpu or alloc
            replace = True
        if replace and node.inputs[0].dtype != 'float32':
@@ -1424,15 +1430,15 @@ def local_gpu_eye(node):

    eye(host_from_gpu) -> host_from_gpu(gpueye)
    """
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, tensor.Eye) and
            host_input.owner.op.dtype == "float32"):
            return [gpu_eye(*host_input.owner.inputs)]
    if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
-                      for i in node.inputs]):
+        if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
+                for i in node.inputs]):
            return [host_from_gpu(gpu_eye(*node.inputs))]
    return False

@@ -1507,14 +1513,18 @@ def local_gpu_extract_diagonal(node):
    extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal)
    gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host)
    """
-    from theano.sandbox import linalg
+    global linalg
+    if linalg is None:
+        from theano.sandbox import linalg
+        linalg = theano.sandbox.linalg
+
    if (isinstance(node.op, linalg.ops.ExtractDiag) and
        isinstance(node.inputs[0].type,
                   theano.tensor.TensorType)):
        inp = node.inputs[0]
        if inp.owner and isinstance(inp.owner.op, HostFromGpu):
            return [host_from_gpu(linalg.extract_diag(gpu_from_host(inp)))]
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, linalg.ops.ExtractDiag) and
@@ -1535,7 +1545,7 @@ def gpuScanOptimization(node):
    """

    #gpu_from_host(scan) -> GPUscan(gpu_from_host)
-    if node.op == gpu_from_host:
+    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op, scan_op.Scan) and
@@ -1596,8 +1606,8 @@ def gpuScanOptimization(node):
    #scan(host_from_gpu) -> host_from_gpu(GPUscan)
    if (type(node.op) == scan_op.Scan
        and not node.op.info['gpu']):
-        if numpy.any([(i.owner and i.owner.op == host_from_gpu)
-                      for i in node.inputs]):
+        if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
+                for i in node.inputs]):

            thescan = node.op
            info = copy.deepcopy(thescan.info)

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -1190,32 +1190,31 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip=True):

    # it also might be the case that there is a dimshuffle between the +
    # and the dot22. local_dot_to_dot22 in particular will put in such things.
-    if M.owner and isinstance(M.owner.op, T.DimShuffle):
+    if (M.owner and isinstance(M.owner.op, T.DimShuffle) and
+        M.owner.inputs[0].owner and
+        isinstance(M.owner.inputs[0].owner.op, Dot22)):
        MM = M.owner.inputs[0]
-        if tuple(M.owner.op.new_order) == (0,):
+        if M.owner.op.new_order == (0,):
            # it is making a column MM into a vector
-            if MM.owner and MM.owner.op == _dot22:
-                MMl, MMr = MM.owner.inputs
-                g = gemm_no_inplace(L.dimshuffle(0, 'x'),
-                        alpha, MMl, MMr, beta)
-                rval = [g.dimshuffle(0)]
-                return rval, MM
-        if tuple(M.owner.op.new_order) == (1,):
+            MMl, MMr = MM.owner.inputs
+            g = gemm_no_inplace(L.dimshuffle(0, 'x'),
+                                alpha, MMl, MMr, beta)
+            rval = [g.dimshuffle(0)]
+            return rval, MM
+        if M.owner.op.new_order == (1,):
            # it is making a row MM into a vector
-            if MM.owner and MM.owner.op == _dot22:
-                MMl, MMr = MM.owner.inputs
-                g = gemm_no_inplace(L.dimshuffle('x', 0),
-                        alpha, MMl, MMr, beta)
-                rval = [g.dimshuffle(1)]
-                return rval, MM
-        if tuple(M.owner.op.new_order) == ():
+            MMl, MMr = MM.owner.inputs
+            g = gemm_no_inplace(L.dimshuffle('x', 0),
+                                alpha, MMl, MMr, beta)
+            rval = [g.dimshuffle(1)]
+            return rval, MM
+        if len(M.owner.op.new_order) == 0:
            # it is making a row MM into a vector
-            if MM.owner and MM.owner.op == _dot22:
-                MMl, MMr = MM.owner.inputs
-                g = gemm_no_inplace(L.dimshuffle('x', 'x'),
-                        alpha, MMl, MMr, beta)
-                rval = [g.dimshuffle()]
-                return rval, MM
+            MMl, MMr = MM.owner.inputs
+            g = gemm_no_inplace(L.dimshuffle('x', 'x'),
+                                alpha, MMl, MMr, beta)
+            rval = [g.dimshuffle()]
+            return rval, MM

    # this is False'd out because of inadequate testing.
    # TODO see ticket #237
@@ -1379,29 +1378,31 @@ def _gemm_from_factored_list(lst):
    """Returns None, or a list to replace node.outputs
    """

-    # Make every pair in list have matching dtypes
-    # sM can be a tuple of 2 elements or a theano variable.
-    # We should not use __len__ as theano variables don't support
-    # it. I don't want to change this to isinstance(sM, tuple)
-    # as I'm not able to make a test that triggers this case.
-    def is_pair(sM):
-        try:
-            s, M = sM
-            return True
-        except Exception:
-            return False
-
    lst2 = []
    # Remove the tuple that can't be cast correctly.
    # This can happen when we try to cast a complex to a real
    for sM in lst:
-        if is_pair(sM):
+        # Make every pair in list have matching dtypes
+        # sM can be a tuple of 2 elements or a theano variable.
+        if isinstance(sM, tuple):
            sm0, sm1 = sM
            sm0 = T.as_tensor_variable(sm0)
            if theano.scalar.upcast(sm0.dtype, sm1.dtype) == sm1.dtype:
                lst2.append((T.cast(sm0, sm1.dtype), sM[1]))
+
    lst = lst2

+    def item_to_var(t):
+        try:
+            s, M = t
+        except Exception:
+            return t
+        if s == 1:
+            return M
+        if s == -1:
+            return -M
+        return s * M
+
    # Try every pair in the sM_list, trying to turn it into a gemm operation
    for i in xrange(len(lst) - 1):
        s_i, M_i = lst[i]
@@ -1418,16 +1419,6 @@ def _gemm_from_factored_list(lst):
                                                              s_j, M_j)
            #print 'GOT IT', gemm_of_sM_list
            if gemm_of_sM_list:
-                def item_to_var(t):
-                    try:
-                        s, M = t
-                    except Exception:
-                        return t
-                    if s == 1:
-                        return M
-                    if s == -1:
-                        return -M
-                    return s * M

                assert len(gemm_of_sM_list) == 1
                add_inputs = [item_to_var(input)