Commit aeb8c035 authored by Xavier Bouthillier

Fix optimizations

Parent 76b71018
@@ -84,7 +84,19 @@ class SparseBlockGemv(Op):
         return Apply(self, [o, W, h, inputIdx, outputIdx], [output])
 
     def perform(self, node, inp, out_):
-        raise NotImplementedError('Optimization of SparseBlockGemv failed.')
+        o, W, h, iIdx, oIdx = inp[:5]
+
+        if not self.inplace:
+            o = o.copy()
+
+        for b in range(o.shape[0]):
+            for j in range(o.shape[1]):
+                outputIdx = oIdx[b, j]
+                for i in range(h.shape[1]):
+                    inputIdx = iIdx[b, i]
+                    w = W[inputIdx, outputIdx]
+                    o[b, j, :] += numpy.dot(h[b, i], w)
+        out_[0][0] = o
 
     def grad(self, inputs, grads):
         o, W, h, inputIdx, outputIdx = inputs
@@ -160,50 +172,6 @@ class SparseBlockOuter(Op):
         return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                      [output])
 
-    def perform(self, node, inp, out_):
-        raise NotImplementedError('Optimization of SparseBlockOuter failed.')
-
-    def grad(self, inputs, output_gradients):
-        raise NotImplementedError("SparseBlockOuter has no gradient "
-                                  "implemented")
-
-
-class CpuSparseBlockGemv(SparseBlockGemv):
-    """
-    CPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
-    information.
-
-    This should not be directly called since the interface is subject
-    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
-    function for a stable interface.
-    """
-    def perform(self, node, inp, out_):
-        o, W, h, iIdx, oIdx = inp[:5]
-
-        if not self.inplace:
-            o = o.copy()
-
-        for b in range(o.shape[0]):
-            for j in range(o.shape[1]):
-                outputIdx = oIdx[b, j]
-                for i in range(h.shape[1]):
-                    inputIdx = iIdx[b, i]
-                    w = W[inputIdx, outputIdx]
-                    o[b, j, :] += numpy.dot(h[b, i], w)
-        out_[0][0] = o
-
-
-class CpuSparseBlockOuter(SparseBlockOuter):
-    """
-    CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
-    information.
-
-    This op should not be called directly since its interface is
-    subject to change without notice. It is involved in the gradient
-    of GpuSparseBlockGemv. The gradient is not implemented.
-    """
     def perform(self, node, inp, out_):
         o, x, y, xIdx, yIdx, alpha = inp[:6]
@@ -223,11 +191,6 @@ sparse_block_gemv_inplace = SparseBlockGemv(True)
 sparse_block_outer = SparseBlockOuter(False)
 sparse_block_outer_inplace = SparseBlockOuter(True)
 
-cpu_sparse_block_gemv = CpuSparseBlockGemv(False)
-cpu_sparse_block_gemv_inplace = CpuSparseBlockGemv(True)
-cpu_sparse_block_outer = CpuSparseBlockOuter(False)
-cpu_sparse_block_outer_inplace = CpuSparseBlockOuter(True)
-
 def sparse_block_dot(W, h, inputIdx, b, outputIdx):
     """
...
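The first hunk gives SparseBlockGemv a real CPU perform(): the body that previously lived only in the now-deleted CpuSparseBlockGemv subclass moves into the base op, replacing the old NotImplementedError. For readers, here is a standalone NumPy sketch of the same computation; every shape and name below is illustrative only, not part of the commit:

import numpy

# Illustrative sizes: W holds (nInBlocks x nOutBlocks) weight blocks of
# shape (inSize, outSize); h holds the active input blocks per batch item;
# o is the preloaded output/bias tensor that the op accumulates into.
batch, nActiveIn, nActiveOut = 2, 3, 4
nInBlocks, nOutBlocks, inSize, outSize = 5, 6, 7, 8

rng = numpy.random.RandomState(0)
W = rng.randn(nInBlocks, nOutBlocks, inSize, outSize).astype('float32')
h = rng.randn(batch, nActiveIn, inSize).astype('float32')
o = rng.randn(batch, nActiveOut, outSize).astype('float32')
iIdx = rng.randint(0, nInBlocks, size=(batch, nActiveIn))
oIdx = rng.randint(0, nOutBlocks, size=(batch, nActiveOut))

out = o.copy()  # the non-inplace path copies o, exactly as in perform()
for b in range(out.shape[0]):          # batch items
    for j in range(out.shape[1]):      # active output blocks
        for i in range(h.shape[1]):    # active input blocks
            w = W[iIdx[b, i], oIdx[b, j]]          # (inSize, outSize)
            out[b, j, :] += numpy.dot(h[b, i], w)  # adds an (outSize,) row

print(out.shape)  # (2, 4, 8)

Each output block j accumulates the products of every active input block i with the weight block W[iIdx[b, i], oIdx[b, j]], which is exactly the triple loop added above.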
@@ -18,10 +18,9 @@ import theano.ifelse
 from six.moves import reduce, xrange
 
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
-                        Optimizer, toolbox)
+                        Optimizer, TopoOptimizer, toolbox)
 from theano.gof.opt import LocalMetaOptimizer
 from theano.sandbox.cuda import as_cuda_ndarray_variable
-from theano.sandbox.opt import register_meta_opt
 from theano.sandbox.cuda.basic_ops import (
     gpu_eye, gpu_contiguous,
     gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
@@ -32,8 +31,8 @@ from theano.sandbox.cuda.basic_ops import (
     GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
 from theano.sandbox.cuda.type import CudaNdarrayType
 
-from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
-                                      gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
+from theano.sandbox.cuda.blas import (
+    gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
     GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
     GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
@@ -43,11 +42,17 @@ from theano.sandbox.cuda.cula import gpu_solve
 from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace
 from theano.sandbox.cuda.blas import gpu_ger_no_inplace
-from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
-                                      GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
+from theano.sandbox.cuda.blas import (
+    GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad,
+    GpuDownsampleFactorMaxGradGrad)
 from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
-from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
+from theano.sandbox.cuda.blocksparse import (
+    GpuSparseBlockGemv,
+    GpuSparseBlockOuter,
+    gpu_sparse_block_gemv_inplace,
+    gpu_sparse_block_outer_inplace)
 
 from theano.sandbox.cuda.nnet import (
     GpuCrossentropySoftmaxArgmax1HotWithBias,
@@ -84,7 +89,8 @@ gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
                     'fast_run', 'fast_compile', 'inplace', 'gpu')
 gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2,
                     'fast_run', 'fast_compile', 'gpu')
-# DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS enable the GPU!
+# DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS
+# enable the GPU!
 optdb.register('gpu_opt',
                gpu_seqopt,
                optdb.__position__.get('add_destroy_handler', 49.5) - 1,
@@ -350,8 +356,8 @@ def local_gpu_split(node):
             any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx
                  in outs_clients])):
             new_op = GpuSplit(node.op.len_splits)
-            split_res = new_op(as_cuda_ndarray_variable(input), *node.inputs[1:],
-                               return_list=True)
+            split_res = new_op(as_cuda_ndarray_variable(input),
+                               *node.inputs[1:], return_list=True)
             return [host_from_gpu(o) for o in split_res]
     return False
@@ -378,7 +384,8 @@ def local_gpu_dimshuffle_0(node):
             dimshuffle_node = host_input.owner
             new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable,
                                    dimshuffle_node.op.new_order)
-            return [new_op(as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))]
+            return [new_op(
+                as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))]
     return False
@@ -393,8 +400,8 @@ def local_gpu_specifyShape_0(node):
     if isinstance(node.op, tensor.SpecifyShape):
         input = node.inputs[0]
         if input.owner and isinstance(input.owner.op, HostFromGpu):
-            return [host_from_gpu(tensor.specify_shape(as_cuda_ndarray_variable(input),
-                                                       *node.inputs[1:]))]
+            return [host_from_gpu(tensor.specify_shape(
+                as_cuda_ndarray_variable(input), *node.inputs[1:]))]
     if isinstance(node.op, GpuFromHost):
         host_input = node.inputs[0]
         if host_input.owner and isinstance(host_input.owner.op,
@@ -471,11 +478,15 @@ def local_gpu_dot_to_dot22(node):
                                shape_out))]
     return False
 
 
 @local_optimizer(None)
 def local_assert_no_cpu_op(node):
-    if not isinstance(node.op, GpuOp) and all([var.owner and isinstance(var.owner.op,
-            HostFromGpu) for var in node.inputs]) and any([[c for c in var.clients
-            if isinstance(c[0].op, GpuFromHost)] for var in node.outputs]):
+    if (not isinstance(node.op, GpuOp) and
+            all([var.owner and isinstance(var.owner.op, HostFromGpu)
+                 for var in node.inputs]) and
+            any([[c for c in var.clients if isinstance(c[0].op, GpuFromHost)]
+                 for var in node.outputs])):
         if config.assert_no_cpu_op == "warn":
             _logger.warning(("CPU op %s is detected in the computational"
                              " graph") % node)
@@ -576,7 +587,8 @@ def local_gpu_dot22(node):
         if host_input.owner and isinstance(host_input.owner.op,
                                            tensor.blas.Dot22):
             x, y = host_input.owner.inputs
-            return [gpu_dot22(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))]
+            return [gpu_dot22(as_cuda_ndarray_variable(x),
+                              as_cuda_ndarray_variable(y))]
     if isinstance(node.op, tensor.blas.Dot22):
         if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                 for i in node.inputs]):
@@ -601,7 +613,8 @@ def local_gpu_dot22scalar(node):
                 isinstance(host_input.owner.op,
                            tensor.blas.Dot22Scalar)):
             x, y, scalar = host_input.owner.inputs
-            return [gpu_dot22scalar(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y),
+            return [gpu_dot22scalar(as_cuda_ndarray_variable(x),
+                                    as_cuda_ndarray_variable(y),
                                     tensor.blas._as_scalar(scalar))]
     if isinstance(node.op, tensor.blas.Dot22Scalar):
         if any([i.owner and isinstance(i.owner.op, HostFromGpu)
@@ -629,7 +642,8 @@ def local_gpu_solve(node):
                 isinstance(host_input.owner.op,
                            slinalg.Solve)):
             x, y = host_input.owner.inputs
-            return [gpu_solve(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))]
+            return [gpu_solve(as_cuda_ndarray_variable(x),
+                              as_cuda_ndarray_variable(y))]
     if isinstance(node.op, slinalg.Solve):
         if any([i.owner and isinstance(i.owner.op, HostFromGpu)
@@ -715,8 +729,7 @@ def local_gpu_ger(node):
                 as_cuda_ndarray_variable(z),
                 a,
                 as_cuda_ndarray_variable(x),
-                as_cuda_ndarray_variable(y)
-                ))]
+                as_cuda_ndarray_variable(y)))]
     return False
@@ -745,10 +758,11 @@ def local_gpu_gemm(node):
         y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
         z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
         if x_on_gpu or y_on_gpu or z_on_gpu:
-            return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z),
+            return [host_from_gpu(gpu_gemm_no_inplace(
+                as_cuda_ndarray_variable(z),
                 a,
-                gpu_from_host(x),
-                gpu_from_host(y),
+                as_cuda_ndarray_variable(x),
+                as_cuda_ndarray_variable(y),
                 b))]
     return False
@@ -886,8 +900,8 @@ def local_gpu_elemwise_careduce(node):
             # automatically add more case, as some like trigonometic
             # operation with some reduction pattern will probably result
             # to slow down.
-            isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)
-            ):
+            isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)):
         op = node.op
         inp = node.inputs[0].owner.inputs[0]
         return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)]
@@ -902,7 +916,8 @@ def local_gpu_reshape(node):
                 isinstance(host_input.owner.op, tensor.Reshape):
             rshp = host_input.owner.op
             x, shp = host_input.owner.inputs
-            gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x), shp)
+            gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x),
+                                                shp)
             if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
                 # this can happen as we always return False for all broadcast
                 # dim in GpuReshape but not for Reshape
@@ -961,23 +976,27 @@ def local_gpu_subtensor(node):
                 # to the GPU in that case.
                 return
             coords = host_input.owner.inputs[1:]
-            return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x), *coords)]
+            return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x),
+                                                *coords)]
     if isinstance(node.op, tensor.Subtensor):
         x = node.inputs[0]
         if (x.owner and
                 isinstance(x.owner.op, HostFromGpu) and
                 x.dtype == "float32"):
             gpu_x = x.owner.inputs[0]
             if (gpu_x.owner and
                     isinstance(gpu_x.owner.op, GpuFromHost) and
                     # And it is a shared var or an input of the graph.
                     not gpu_x.owner.inputs[0].owner):
                 if len(x.clients) == 1:
                     if any([n == 'output' or isinstance(n.op, GpuOp)
                             for n, _ in node.outputs[0].clients]):
                         return
                     else:
-                        return [host_from_gpu(as_cuda_ndarray_variable(node.outputs[0]))]
+                        return [host_from_gpu(as_cuda_ndarray_variable(
+                            node.outputs[0]))]
                 return
 
         gpu_x, = x.owner.inputs
@@ -996,7 +1015,8 @@ def local_gpu_advanced_subtensor1(node):
                 host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
             x = host_input.owner.inputs[0]
             coords = host_input.owner.inputs[1:]
-            return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x), *coords)]
+            return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x),
+                                            *coords)]
     if node.op.__class__ is tensor.AdvancedSubtensor1:
         x = node.inputs[0]
         coords = node.inputs[1:]
@@ -1032,12 +1052,14 @@ def local_gpu_advanced_incsubtensor1(node):
         if (compute_capability < 2 or
                 x.ndim != 2 or
                 y.ndim != 2):
             gpu_op = GpuAdvancedIncSubtensor1(
                 set_instead_of_inc=set_instead_of_inc)
         else:
             gpu_op = GpuAdvancedIncSubtensor1_dev20(
                 set_instead_of_inc=set_instead_of_inc)
-        return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
+        return [gpu_op(as_cuda_ndarray_variable(x),
+                       as_cuda_ndarray_variable(y), *coords)]
 
     # Should not execute for GpuAdvancedIncSubtensor1
     if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
@@ -1188,7 +1210,7 @@ def local_gpu_pdbbreakpoint_op(node):
         nb_monitored_vars = len(node.outputs)
         for i in range(nb_monitored_vars):
-            inp = old_inputs[i+1]
+            inp = old_inputs[i + 1]
             out = old_outputs[i]
 
             input_is_from_gpu = (inp.owner and
@@ -1256,8 +1278,7 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
                             tensor.basic._convert_to_int32,
                             tensor.basic._convert_to_int8,
                             tensor.basic._convert_to_int16,
-                            tensor.basic._convert_to_int64,
-                            )
+                            tensor.basic._convert_to_int64)
             while y.owner and y.owner.op in int_cast_ops:
                 y = y.owner.inputs[0]
             gpu_nll, gpu_sm, gpu_am = \
@@ -1307,7 +1328,8 @@ def local_gpu_softmax_with_bias(node):
         x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
         b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
         if x_on_gpu or b_on_gpu:
-            gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(b))
+            gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x),
+                                          as_cuda_ndarray_variable(b))
             return [host_from_gpu(gpu_sm)]
     return False
@@ -1324,6 +1346,7 @@ def _gpu_conv_to_fftconv(node):
     if (node.op.imshp is not None and
             node.op.imshp[-1] is not None and
             node.op.imshp[-1] % 2 == 1):
         kwargs['pad_last_dim'] = True
     # If the user supplied the full nonsymbolic image_shape and
     # filter_shape in conv2d(), we can pass it on to conv2d_fft().
@@ -1337,7 +1360,8 @@ def _gpu_conv_to_fftconv(node):
             (node.op.nkern is not None) and
             (len(node.op.imshp) == 3) and
             (node.op.imshp[0] is not None)):
-        kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + node.op.kshp
+        kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + \
+            node.op.kshp
     rval = conv2d_fft(node.inputs[0], node.inputs[1], **kwargs)
     if node.outputs[0].broadcastable != rval.broadcastable:
         # With given shape information, conv2d_fft may return a different
@@ -1353,6 +1377,7 @@ def local_conv_fft_valid(node):
         if (node.op.border_mode == 'valid' and
                 node.op.subsample == (1, 1) and
                 node.op.fft_opt):
             return [_gpu_conv_to_fftconv(node)]
     return False
@@ -1363,6 +1388,7 @@ def local_conv_fft_full(node):
         if (node.op.border_mode == 'full' and
                 node.op.subsample == (1, 1) and
                 node.op.fft_opt):
             return [_gpu_conv_to_fftconv(node)]
     return
@@ -1476,6 +1502,7 @@ def local_gpu_conv(node):
 def local_conv_gemm(node):
     if (isinstance(node.op, GpuConv) and
             node.op.border_mode in ['full', 'valid']):
         img, kern = node.inputs
         border_mode = node.op.border_mode
         subsample = node.op.subsample
@@ -1599,10 +1626,12 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
             if ((var in inputs) and
                     (shape is not None) and
                     not any(s is None for s in shape)):
                 result[var] = theano.shared(
-                    # TODO: Use var.type.filter when cuda_ndarray.filter supports non-strict casts
-                    # var.type.filter(numpy.random.randn(*shape),
-                    #                 allow_downcast=True),
+                    # TODO: Use var.type.filter when cuda_ndarray.filter
+                    # supports non-strict casts
+                    # var.type.filter(numpy.random.randn(*shape),
+                    #                 allow_downcast=True),
                     numpy.require(numpy.random.randn(*shape),
                                   dtype=var.dtype),
                     var.name,
@@ -1616,7 +1645,8 @@ conv_metaopt = ConvMetaOptimizer(
     conv_groupopt.query(*['+' + name for name in conv_groupopt._names]).opts)
 # Then we add some optimizers that try less obvious options
 conv_metaopt.register(dnn.local_conv_dnn_alternative)
-# Finally, we register the metaoptimizer as the first optimizer in conv_groupopt
+# Finally, we register the metaoptimizer as the first optimizer in
+# conv_groupopt
 conv_groupopt.register('conv_meta', conv_metaopt, 0)
@@ -1661,6 +1691,7 @@ def local_convgrad3d_fft(node):
         return False
     if (isinstance(node.op, ConvGrad3D) and
             (stride_x, stride_y, stride_z) == (1, 1, 1)):
         # we import conv3d_fft locally to avoid pycuda warnings
         from theano.sandbox.cuda.fftconv import conv3d_fft
         # Shuffle inputs signal from (b, 0, 1, t, ic) to (ic, b, 0, 1, t)
@@ -1747,8 +1778,8 @@ def local_convgrad3d_gemm(node):
         f = node.inputs[3]
         f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3))
-        rval = GpuCorr3dMM_gradWeights(subsample=(sx, sy, sz))(x, f,
-                                                               shape=node.inputs[2][1:4])
+        rval = GpuCorr3dMM_gradWeights(subsample=(sx, sy, sz))(
+            x, f, shape=node.inputs[2][1:4])
         # Shuffle from (ic, oc, 0, 1, t) to (oc, 0, 1, t, ic)
         return [rval.dimshuffle(0, 2, 3, 4, 1)]
@@ -1770,7 +1801,8 @@ def local_convtransp3d_gemm(node):
         # Shuffle dCdH from (b, 0, 1, t, oc) to (b, oc, 0, 1, t)
         f = node.inputs[3]
         f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3))
-        rval = GpuCorr3dMM_gradInputs(subsample=(sx, sy, sz))(kern=x, topgrad=f)
+        rval = GpuCorr3dMM_gradInputs(subsample=(sx, sy, sz))(kern=x,
+                                                              topgrad=f)
         # Shuffle from (ic, b, 0, 1, t) to (b, 0, 1, t, ic)
         return [rval.dimshuffle(0, 2, 3, 4, 1) + node.inputs[1]]
@@ -1786,6 +1818,7 @@ import theano.tensor.signal.downsample as downsample
 def local_gpu_downsample_factor_max(node):
     if (isinstance(node.op, downsample.DownsampleFactorMax)
             and node.op.ds == node.op.st):
         assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
                                      'mode')
         if node.op.padding != (0, 0) or node.op.mode != 'max':
@@ -1801,11 +1834,13 @@ def local_gpu_downsample_factor_max(node):
 def local_gpu_downsample_factor_max_grad(node):
     if (isinstance(node.op, downsample.MaxPoolGrad) and
             node.op.ds == node.op.st):
         assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
                                      'mode')
         if (node.op.padding != (0, 0) or
                 node.op.mode != 'max' or
                 node.op.st != node.op.ds):
             return
         x, z, gz = node.inputs
         if (x.owner and isinstance(x.owner.op, HostFromGpu)):
@@ -1876,7 +1911,8 @@ def local_gpu_join(node):
         # print "OPT: axis_and_tensors=", axis_and_tensors
-        matches = [(not t.owner is None and isinstance(t.owner.op, HostFromGpu)) or
+        matches = [(t.owner is not None and
+                    isinstance(t.owner.op, HostFromGpu)) or
                    isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
         # print "OPT: matches =", matches
@@ -1884,7 +1920,8 @@ def local_gpu_join(node):
         if all(matches):
             # the extra gpu_from_host introduced here will
            # be removed by further optimizations
-            new_tensors = [as_cuda_ndarray_variable(t) for t in axis_and_tensors[1:]]
+            new_tensors = [as_cuda_ndarray_variable(t)
+                           for t in axis_and_tensors[1:]]
             new_a_and_t = [axis_and_tensors[0]] + new_tensors
 
             replacement_node = host_from_gpu(gpu_join(*new_a_and_t))
@@ -1941,7 +1978,6 @@ optdb.register('InplaceGpuBlasOpt',
 def get_device_type_sizes():
     """
-
     Returns
     -------
     tuple
@@ -1962,7 +1998,8 @@ def get_device_type_sizes():
         del gpu_int_size
         del t
     except Exception as e:
-        _logger.warning(("Optimization Warning: "
+        _logger.warning((
+            "Optimization Warning: "
             "Got the following error, but you can ignore it. "
            "This could cause less GpuElemwise fused together.\n"
             "%s") % e)
@@ -2037,11 +2074,11 @@ def split_huge_add_or_mul(node):
 # GpuElemwise fusion
 gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
-    GpuElemwise,
-    max_inputs_to_GpuElemwise)
+    GpuElemwise, max_inputs_to_GpuElemwise)
 if config.gpu.local_elemwise_fusion:
     _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
-    # Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5
+    # Must be after cpu fusion at 40, gpu at 48.5 and before
+    # AddDestroyHandler at 49.5
    optdb.register('gpu_elemwise_fusion',
                    tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
                    49, 'fast_run', 'fusion',
@@ -2069,7 +2106,8 @@ gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
     tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle)
 )
 register_opt()(gpu_elemwise_alloc)
-register_opt()(tensor.opt.local_useless_elemwise)  # needed by gpu_elemwise_alloc
+# needed by gpu_elemwise_alloc
+register_opt()(tensor.opt.local_useless_elemwise)
 tensor.opt.register_specialize_device(gpu_elemwise_alloc)
@@ -2115,8 +2153,7 @@ def local_gpualloc(node):
                                        new_out.type.broadcastable):
             assert b_new or (not b_old)
         new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
-    # if old_out.type != new_out.type:
-        #import pdb; pdb.set_trace()
     return [new_out]
@@ -2139,12 +2176,14 @@ def local_gpualloc_memset_0(node):
         if (isinstance(inp, CudaNdarrayConstant) and
                 inp.data.size == 1 and
                 (numpy.asarray(inp.data) == 0).all()):
             new_out = GpuAlloc(memset_0=True)(*node.inputs)
             old_bcast = node.outputs[0].type.broadcastable
             if new_out.type.broadcastable != old_bcast:
-                # check that we did not try discarding a broadcastable dimension
-                assert not any(b_old and not b_new for b_old, b_new in zip(
-                    old_bcast, new_out.type.broadcastable))
+                # check that we did not try discarding a broadcastable
+                # dimension
+                assert not any(b_old and not b_new for b_old, b_new in
+                               zip(old_bcast, new_out.type.broadcastable))
                 # force old broadcasting pattern; we must not change it here
                 new_out = tensor.patternbroadcast(new_out, old_bcast)
             return [new_out]
@@ -2177,6 +2216,7 @@ def local_gpu_eye(node):
         if (host_input.owner and
                 isinstance(host_input.owner.op, tensor.Eye) and
                 host_input.owner.op.dtype == "float32"):
             return [gpu_eye(*host_input.owner.inputs)]
     if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
         if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
@@ -2188,6 +2228,7 @@ def local_gpu_eye(node):
 def safe_to_gpu(x):
     if (isinstance(x.type, tensor.TensorType) and
             x.type.dtype == 'float32'):
         return as_cuda_ndarray_variable(x)
     else:
         return x
@@ -2242,6 +2283,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
 def tensor_to_cuda(x):
     if (isinstance(x.type, tensor.TensorType) and
             x.type.dtype == 'float32'):
         y = CudaNdarrayType(broadcastable=x.type.broadcastable)()
         if x.name:
             y.name = x.name + '[cuda]'
@@ -2264,7 +2306,8 @@ def local_gpu_extract_diagonal(node):
                    theano.tensor.TensorType)):
         inp = node.inputs[0]
         if inp.owner and isinstance(inp.owner.op, HostFromGpu):
-            return [host_from_gpu(nlinalg.extract_diag(as_cuda_ndarray_variable(inp)))]
+            return [host_from_gpu(nlinalg.extract_diag(
+                as_cuda_ndarray_variable(inp)))]
     if isinstance(node.op, GpuFromHost):
         host_input = node.inputs[0]
         if (host_input.owner and
@@ -2300,6 +2343,7 @@ def gpuScanOptimization(node):
                 isinstance(host_input.owner.op, scan_op.Scan) and
                 not host_input.owner.op.info['gpu'] and
                 len(host_input.owner.outputs) == 1):
             # Note that we are not doing the right thing here !!
             # This is because the local optimizer expects only one
             # output that corresponds to the input of ``node``
@@ -2353,6 +2397,7 @@ def gpuScanOptimization(node):
     # scan(host_from_gpu) -> host_from_gpu(GPUscan)
     if (type(node.op) == scan_op.Scan
             and not node.op.info['gpu']):
         if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
                 for i in node.inputs]):
@@ -2434,7 +2479,8 @@ optdb.register('gpu_scanOp_make_inplace',
 # @alpha_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=?, nd=4)
 # def local_merge_blocksparse_alpha(node, *inputs):
 #     """
-#     GpuElemwise{mul}(lr, GpuSparseBlockOuter) -> GpuSparseBlockOuter(..., alpha=lr)
+#     GpuElemwise{mul}(lr, GpuSparseBlockOuter) ->
+#     GpuSparseBlockOuter(..., alpha=lr)
 #     """
 #     return [gpu_sparse_block_outer(*inputs)]
@@ -2465,8 +2511,7 @@ def _clear_host_from_gpu(inputs):
     return clean_inputs
 
-@register_meta_opt(SparseBlockGemv, ["gpu_opt", "gpu_local_optimizations"],
-                   0., 'fast_run', 'fast_compile', 'gpu')
+@register_opt()
 @local_optimizer([SparseBlockGemv, GpuFromHost])
 def gpu_sparse_block_gemv_opt(node):
     """
@@ -2493,8 +2538,7 @@ def gpu_sparse_block_gemv_opt(node):
     return [GpuSparseBlockGemv(meta_node.op.inplace)(*inputs)]
 
-@register_meta_opt(SparseBlockOuter, ["gpu_opt", "gpu_local_optimizations"],
-                   0., 'fast_run', 'fast_compile', 'gpu')
+@register_opt()
 @local_optimizer([SparseBlockOuter, GpuFromHost])
 def gpu_sparse_block_outer_opt(node):
     """
@@ -2522,4 +2566,36 @@ def gpu_sparse_block_outer_opt(node):
     return [GpuSparseBlockOuter(meta_node.op.inplace)(*inputs)]
 
+
+@local_optimizer([GpuSparseBlockGemv], inplace=True)
+def local_inplace_gpu_sparse_block_gemv(node):
+    """
+    GpuSparseBlockGemv(inplace=False) -> GpuSparseBlockGemv(inplace=True)
+    """
+    if isinstance(node.op, GpuSparseBlockGemv) and not node.op.inplace:
+        new_node = gpu_sparse_block_gemv_inplace(*node.inputs)
+        return [new_node]
+    return False
+
+compile.optdb.register('local_inplace_gpu_sparse_block_gemv',
+                       TopoOptimizer(
+                           local_inplace_gpu_sparse_block_gemv,
+                           failure_callback=TopoOptimizer.warn_inplace),
+                       60, 'fast_run', 'inplace', 'gpu')  # DEBUG
+
+
+@local_optimizer([GpuSparseBlockOuter], inplace=True)
+def local_inplace_gpu_sparse_block_outer(node):
+    """
+    GpuSparseBlockOuter(inplace=False) -> GpuSparseBlockOuter(inplace=True)
+    """
+    if isinstance(node.op, GpuSparseBlockOuter) and not node.op.inplace:
+        new_node = gpu_sparse_block_outer_inplace(*node.inputs)
+        return [new_node]
+    return False
+
+compile.optdb.register('local_inplace_gpu_sparse_block_outer',
+                       TopoOptimizer(
+                           local_inplace_gpu_sparse_block_outer,
+                           failure_callback=TopoOptimizer.warn_inplace),
+                       60, 'fast_run', 'inplace', 'gpu')  # DEBUG
+
 import theano.sandbox.cuda.extra_ops
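The tail of this file replaces the register_meta_opt registrations with plain register_opt() transfers and adds two TopoOptimizer passes that swap in the inplace GPU ops. A quick way to see the result is to compile a small graph and print it. The snippet below is an illustrative sketch only, not part of the commit; it assumes a CUDA-enabled Theano, and the integer index types are an assumption since their construction is elided in the hunks above:

import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot

b = tensor.fmatrix()
W = tensor.ftensor4()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()  # assumed index dtype
oIdx = tensor.lmatrix()

o = sparse_block_dot(W, h, iIdx, b, oIdx)
f = theano.function([W, h, iIdx, b, oIdx], o)

# On a CUDA device with fast_run, the compiled graph should now contain
# GpuSparseBlockGemv with inplace=True instead of the host op.
theano.printing.debugprint(f)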
@@ -2,170 +2,42 @@
 Optimizations addressing the ops in sandbox root directory
 """
 
-import bisect
-import logging
-
-from theano.compile import optdb
-from theano.gof import local_optimizer, EquilibriumDB
-from theano.tensor.opt import register_specialize
+from theano import compile  # to register the optimizer built by this file
+from theano import gof
 
 from theano.sandbox.blocksparse import (
     SparseBlockGemv,
     SparseBlockOuter,
-    sparse_block_gemv,
-    sparse_block_outer,
     sparse_block_gemv_inplace,
-    sparse_block_outer_inplace,
-    CpuSparseBlockGemv,
-    CpuSparseBlockOuter)
-
-_logger = logging.getLogger('theano.sandbox.opt')
-
-
-def _db_exists(db, db_name):
-    """
-    Tests whether the full path from `db_name[0]` down to
-    `db_name[-1]` exists.
-
-    Parameters
-    ----------
-    db: `theano.gof.optdb.DB`
-        A dataset of optimisations or sub-datasets.
-    db_name: list or tuple of strings
-        Names of datasets from given one `db[db_name[0]]` down
-        to the dataset of interest where to register.
-        ex: ['level_1_dataset', 'level_2_dataset']
-    """
-    if len(db_name) == 1:
-        return db_name[0] in db._names
-    return db_name[0] in db._names and _db_exists(db[db_name[0]], db_name[1:])
-
-
-def _db_register(db, db_name, *args):
-    """
-    Registers an object in last datasets given in db_name. `db_name[-1]`
-    is deep in the hierarchy of `db`.
-
-    Parameters
-    ----------
-    db: `theano.gof.optdb.DB`
-        A dataset of optimisations or sub-datasets.
-    db_name: list or tuple of strings
-        Names of datasets from given one `db[db_name[0]]` down
-        to the dataset of interest where to register.
-        ex: ['level_1_dataset', 'level_2_dataset']
-    """
-    if len(db_name) == 0:
-        return db.register(*args)
-    return _db_register(db[db_name[0]], db_name[1:], *args)
-
-
-def _db_positions(db, db_name, positions=()):
-    """
-    Returns the list of positions of all databases from `db_name[0]`
-    down to `db_name[-1]`. The path is hierarchical, hence `db_name[0]`
-    is in `db`, `db_name[1]` is in `db[db_name[0]]`, etc.
-
-    Parameters
-    ----------
-    db: `theano.gof.optdb.DB`
-        A dataset of optimisations or sub-datasets.
-    db_name: list or tuple of strings
-        Names of datasets from given one `db[db_name[0]]` down
-        to the dataset of interests.
-        ex: ['level_1_dataset', 'level_2_dataset']
-    """
-    if len(db_name) == 0:
-        return positions
-    db_position = db.__position__.get(db_name[0], 0.)
-    return _db_positions(db[db_name[0]], db_name[1:],
-                         positions + (db_position, ))
-
-
-def register_meta_opt(op_class, db_name, position, *args):
-    """
-    Registers a given optimization under given database name and saves
-    optimization information in `op_class.registered_opts`.
-
-    Parameters
-    ----------
-    op_class: `theano.gof.Op`
-        A meta Op which have multiple implementations available
-        for optimization.
-    db_name: string, list or tuple of strings
-        A string if optimization is inserted in `theano.compile.optdb`
-        directly. List is used to insert an optimization deep inside a
-        hierarchy of optimization databases.
-    position: int or float
-        Position of the optimisation in the target dataset.
-        (Position in deep database if not optdb)
-    *args
-        Arguments to register the optimization.
-    """
-    if isinstance(db_name, str):
-        db_name = [db_name]
-
-    def call(local_meta_opt):
-        if not _db_exists(optdb, db_name):
-            # TODO: Would another default DB be better?
-            _db_register(optdb, db_name[:-2],
-                         db_name[-1], EquilibriumDB(), position, *args)
-        _db_register(optdb, db_name,
-                     local_meta_opt.__name__, local_meta_opt, *args)
-        positions = _db_positions(optdb, db_name)
-        idx = bisect.bisect_left((positions, local_meta_opt),
-                                 op_class.registered_opts)
-        op_class.registered_opts.insert(idx,
-                                        (positions, local_meta_opt.__name__))
-        return local_meta_opt
-    return call
-
-
-@register_meta_opt(SparseBlockGemv, ["meta_cpu"], 51.0,
-                   "fast_run", "fast_compile")
-@local_optimizer([SparseBlockGemv])
-def cpu_sparse_block_gemv_opt(node):
-    """
-    SparseBlockGemv -> CpuSparseBlockGemv
-    """
-    return [CpuSparseBlockGemv(node.op.inplace)(*node.inputs)]
-
-
-@register_meta_opt(SparseBlockOuter, ["meta_cpu"], 51.0,
-                   "fast_run", "fast_compile")
-@local_optimizer([SparseBlockOuter])
-def cpu_sparse_block_outer_opt(node):
-    """
-    SparseBlockOuter -> CpuSparseBlockOuter
-    """
-    return [CpuSparseBlockOuter(node.op.inplace)(*node.inputs)]
+    sparse_block_outer_inplace)
 
-@register_specialize
-@local_optimizer([sparse_block_gemv], inplace=True)
-def local_inplace_block_sparse_gemv(node):
+@gof.local_optimizer([SparseBlockGemv], inplace=True)
+def local_inplace_sparse_block_gemv(node):
     """
     SparseBlockGemv(inplace=False) -> SparseBlockGemv(inplace=True)
     """
-    return [sparse_block_gemv_inplace(*node.inputs)]
+    if isinstance(node.op, SparseBlockGemv) and not node.op.inplace:
+        new_node = sparse_block_gemv_inplace(*node.inputs)
+        return [new_node]
+    return False
+
+compile.optdb.register('local_inplace_sparse_block_gemv',
+                       gof.TopoOptimizer(
+                           local_inplace_sparse_block_gemv,
+                           failure_callback=gof.TopoOptimizer.warn_inplace),
+                       60, 'fast_run', 'inplace')  # DEBUG
 
-@register_specialize
-@local_optimizer([sparse_block_outer], inplace=True)
-def local_inplace_block_sparse_outer(node):
+@gof.local_optimizer([SparseBlockOuter], inplace=True)
+def local_inplace_sparse_block_outer(node):
     """
     SparseBlockOuter(inplace=False) -> SparseBlockOuter(inplace=True)
     """
-    return [sparse_block_outer_inplace(*node.inputs)]
+    if isinstance(node.op, SparseBlockOuter) and not node.op.inplace:
+        new_node = sparse_block_outer_inplace(*node.inputs)
+        return [new_node]
+    return False
+
+compile.optdb.register('local_inplace_sparse_block_outer',
+                       gof.TopoOptimizer(
+                           local_inplace_sparse_block_outer,
+                           failure_callback=gof.TopoOptimizer.warn_inplace),
+                       60, 'fast_run', 'inplace')  # DEBUG
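With perform() implemented on the base ops, the block-sparse graph now runs on the CPU as well, and these TopoOptimizer passes merely switch it to the destructive variants under fast_run. A hedged end-to-end sketch, not from the commit: it assumes b is the per-output-block bias of shape (nOutBlocks, outSize) and reuses the illustrative shapes from the NumPy example above:

import numpy
import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot

W = tensor.ftensor4()
h = tensor.ftensor3()
b = tensor.fmatrix()
iIdx = tensor.lmatrix()  # assumed index dtype
oIdx = tensor.lmatrix()

o = sparse_block_dot(W, h, iIdx, b, oIdx)
# Excluding the 'inplace' tag keeps the copying op, which is handy when
# debugging destructive updates.
mode = theano.compile.get_default_mode().excluding('inplace')
f = theano.function([W, h, iIdx, b, oIdx], o, mode=mode)

rng = numpy.random.RandomState(0)
out = f(rng.randn(5, 6, 7, 8).astype('float32'),   # W
        rng.randn(2, 3, 7).astype('float32'),      # h
        rng.randint(0, 5, size=(2, 3)),            # iIdx
        rng.randn(6, 8).astype('float32'),         # b (shape assumed)
        rng.randint(0, 6, size=(2, 4)))            # oIdx
print(out.shape)  # (2, 4, 8)
assert not f.maker.fgraph.toposort()[-1].op.inplace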
@@ -11,7 +11,7 @@ from theano import tensor
 import theano.tests.unittest_tools as utt
 
 from theano.sandbox.blocksparse import sparse_block_dot, \
-    cpu_sparse_block_gemv, cpu_sparse_block_outer
+    sparse_block_gemv, sparse_block_outer
 
 class BlockSparse_Gemv_and_Outer(unittest.TestCase):
@@ -24,8 +24,8 @@ class BlockSparse_Gemv_and_Outer(unittest.TestCase):
         self.mode = theano.compile.get_default_mode().excluding(
             'constant_folding'
         )
-        self.gemv_op = cpu_sparse_block_gemv
-        self.outer_op = cpu_sparse_block_outer
+        self.gemv_op = sparse_block_gemv
+        self.outer_op = sparse_block_outer
 
     @staticmethod
     def gemv_data():
...
 import theano
 from theano import tensor
-from theano.sandbox.blocksparse import CpuSparseBlockGemv, \
-    CpuSparseBlockOuter, sparse_block_dot
+from theano.sandbox.blocksparse import sparse_block_dot
 
-def test_blocksparse_cpu_gemv_opt():
+def test_blocksparse_inplace_gemv_opt():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
     h = tensor.ftensor3()
@@ -15,10 +14,13 @@ def test_blocksparse_inplace_gemv_opt():
     f = theano.function([W, h, iIdx, b, oIdx], o)
 
-    assert isinstance(f.maker.fgraph.toposort()[-1].op, CpuSparseBlockGemv)
+    if theano.config.mode == "FAST_COMPILE":
+        assert not f.maker.fgraph.toposort()[-1].op.inplace
+    else:
+        assert f.maker.fgraph.toposort()[-1].op.inplace
 
-def test_blocksparse_cpu_outer_opt():
+def test_blocksparse_inplace_outer_opt():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
     h = tensor.ftensor3()
@@ -32,4 +34,7 @@ def test_blocksparse_inplace_outer_opt():
     f = theano.function([W, h, iIdx, b, oIdx],
                         [o, tensor.grad(o.sum(), wrt=W)])
 
-    assert isinstance(f.maker.fgraph.toposort()[-1].op, CpuSparseBlockOuter)
+    if theano.config.mode == "FAST_COMPILE":
+        assert not f.maker.fgraph.toposort()[-1].op.inplace
+    else:
+        assert f.maker.fgraph.toposort()[-1].op.inplace