Made the stricter optimization consistent

8ea065e6 · sentient07 · 8dfe6847 · 8ea065e6 · 8ea065e6
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -23,7 +23,7 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
 from theano.tensor.signal.pool import (
    Pool, MaxPoolGrad, AveragePoolGrad)
 from . import pygpu
-from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
+from .type import get_context, gpu_context_type, list_contexts
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        gpu_contiguous, gpu_alloc_empty,
                        empty_like)
@@ -1428,21 +1428,18 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):


 @register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
-                AbstractConv2d_gradInputs], 'conv_dnn', 'cudnn', 'gpuarray', 'fast_compile')
+                AbstractConv2d_gradInputs], 'fast_compile')
 def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
    if (not isinstance(op, (AbstractConv2d,
                            AbstractConv2d_gradWeights,
                            AbstractConv2d_gradInputs))):
-        return None
-
-    inp1 = inputs[0]
-    inp2 = inputs[1]
+        return

-    if (node.op.filter_dilation != (1, 1)):
+    if (op.filter_dilation != (1, 1)):
        return None

-    if not isinstance(inp1.type, GpuArrayType):
-        return None
+    inp1 = as_gpuarray_variable(inputs[0], context_name)
+    inp2 = as_gpuarray_variable(inputs[1], context_name)

    if not dnn_available(inp1.type.context_name):
        raise_no_cudnn()

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -30,7 +30,7 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
 from theano.tests.breakpoint import PdbBreakpoint

 from .type import (GpuArrayType, GpuArrayConstant, get_context,
-                   ContextNotDefined, GpuArrayVariable, GpuArraySharedVariable)
+                   ContextNotDefined)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        host_from_gpu, GpuToGpu,
                        HostFromGpu, GpuFromHost,
@@ -291,10 +291,9 @@ class GraphToGPU(NavigatorOptimizer):
        target = infer_context_name(*fgraph.inputs)
        for i in fgraph.inputs:
            # Do not move *int* scalar to the GPU.
-            target = getattr(i.tag, 'target', None)
            if (isinstance(i.type, tensor.TensorType) and
               (i.ndim > 0 or 'int' not in i.dtype)):
-                mapping[i] = i.transfer(target)
+                mapping[i] = as_gpuarray_variable(i, target)
            else:
                mapping[i] = i
        for i in fgraph.variables:
@@ -316,12 +315,6 @@ class GraphToGPU(NavigatorOptimizer):

            # Move only if any of the inputs are on the GPU.
            move_to_GPU = False
-            if any([isinstance(i, GpuArrayVariable) or
-                   isinstance(i, GpuArraySharedVariable)
-                   for i in [mapping[v] for v in node.inputs] +
-                   node.outputs]):
-
-                move_to_GPU = True

            context_name = None
            for i in [mapping[i] for i in node.inputs]:
@@ -346,20 +339,20 @@ class GraphToGPU(NavigatorOptimizer):
            new_ops = None
            outputs = []
            # Apply the lifter
-            for lopt in (self.local_optimizers_map.get(node.op, []) +
-                         self.local_optimizers_map.get(type(node.op), []) +
-                         self.local_optimizers_all):
-                if move_to_GPU:
-                    t_opt = time.time()
-                    new_ops = lopt.transform(node.op, context_name,
-                                             [mapping[i] for i in node.inputs],
-                                             node.outputs)
-                    t_opt2 = time.time()
-                    time_opts[lopt] += t_opt2 - t_opt
-
-                    if new_ops:
-                        process_count[lopt] += 1
-                        break
+            if move_to_GPU:
+                for lopt in (self.local_optimizers_map.get(node.op, []) +
+                             self.local_optimizers_map.get(type(node.op), []) +
+                             self.local_optimizers_all):
+                        t_opt = time.time()
+                        new_ops = lopt.transform(node.op, context_name,
+                                                 [mapping[i] for i in node.inputs],
+                                                 node.outputs)
+                        t_opt2 = time.time()
+                        time_opts[lopt] += t_opt2 - t_opt
+
+                        if new_ops:
+                            process_count[lopt] += 1
+                            break
            if not new_ops:
                newnode = node.clone_with_new_inputs([mapping.get(i)
                                                      for i in node.inputs])
@@ -754,7 +747,7 @@ def local_gpua_dimshuffle(op, context_name, inputs, outputs):

 @register_opt('fast_compile')
 @op_lifter([tensor.SpecifyShape])
-@register_opt2([tensor.SpecifyShape], 'fast_compile')
+# register_opt2 for SpecifyShape is on local_gpua_specifyShape_graph below.
 def local_gpua_specifyShape(op, context_name, inputs, outputs):
    if isinstance(inputs[0].type, GpuArrayType):
        return
@@ -763,9 +756,15 @@ def local_gpua_specifyShape(op, context_name, inputs, outputs):
    return tensor.specify_shape(*inp)


+@register_opt2([tensor.SpecifyShape], 'fast_compile')
+def local_gpua_specifyShape_graph(op, context_name, inputs, outputs):
+    inp = [as_gpuarray_variable(inputs[0], context_name)]
+    inp += inputs[1:]
+    return tensor.specify_shape(*inp)
+
+
 @register_opt('fast_compile')
 @op_lifter([theano.compile.ops.Shape])
-@register_opt2([tensor.compile.ops.Shape], 'fast_compile')
 def local_gpua_shape(op, context_name, inputs, outputs):
    # op_lifter will call this opt too frequently as the output is
    # always on the CPU.
@@ -774,6 +773,13 @@ def local_gpua_shape(op, context_name, inputs, outputs):
    return [as_gpuarray_variable(inputs[0], context_name).shape]


+@register_opt2([tensor.compile.ops.Shape], 'fast_compile')
+def local_gpua_shape_graph(op, context_name, inputs, outputs):
+    # Registered via register_opt2 only (no op_lifter wrapper): the input
+    # is always passed through as_gpuarray_variable before taking .shape.
+    return [as_gpuarray_variable(inputs[0], context_name).shape]
+
+
 def gpu_print_wrapper(op, cnda):
    op.old_op.global_fn(op.old_op, numpy.asarray(cnda))

@@ -863,15 +869,10 @@ def local_gpu_pdbbreakpoint_op(node):
 def local_gpua_lazy_ifelse(op, context_name, inputs, outputs):
    if op.gpu:
        return
-    # this node is already on GPU, so don't change the graph
-    if isinstance(inputs[0].type, GpuArrayType):
-        return
    c = inputs[0]
    inps = []
    for v in inputs[1:]:
-        if isinstance(v.type, GpuArrayType):
-            return
-        elif isinstance(v.type, tensor.TensorType):
+        if isinstance(v.type, tensor.TensorType):
            inps.append(as_gpuarray_variable(v, context_name))
        else:
            inps.append(v)
@@ -1230,15 +1231,19 @@ def local_gpua_softmaxwithbias(op, context_name, inputs, outputs):

 @register_opt('fast_compile')
 @op_lifter([theano.tensor.opt.Assert])
-@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
 def local_assert(op, context_name, inputs, outputs):
-    # Check if input nodes are already on the GPU
    if isinstance(inputs[0].type, GpuArrayType):
        return
    return [op(as_gpuarray_variable(inputs[0], context_name),
               *inputs[1:])]


+@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
+def local_assert_graph(op, context_name, inputs, outputs):
+    return [op(as_gpuarray_variable(inputs[0], context_name),
+               *inputs[1:])]
+
+
 @register_opt('fast_compile')
 @op_lifter([ConvOp])
 @register_opt2([ConvOp], 'fast_compile')
@@ -1286,15 +1291,12 @@ def local_inplace_sparseblockouter(node):


 # This deals with any abstract convs that have a transfer somewhere
-@register_opt('fast_compile')
+@register_opt('fast_compile', 'conv_dnn')
 @op_lifter([AbstractConv2d,
            AbstractConv2d_gradWeights,
            AbstractConv2d_gradInputs])
-@register_opt2([AbstractConv2d,
-                AbstractConv2d_gradWeights,
-                AbstractConv2d_gradInputs], 'fast_compile')
 def local_lift_abstractconv2d(op, context_name, inputs, outputs):
-    if isinstance(inputs[0].type, GpuArrayType):
+    if isinstance(outputs[0].type, GpuArrayType):
        # Don't handle this node here, it's already on the GPU.
        return
    inps = list(inputs)
@@ -1304,6 +1306,18 @@ def local_lift_abstractconv2d(op, context_name, inputs, outputs):
                                   context_name=context_name)
    return [op(*inps)]

+
+@register_opt2([AbstractConv2d,
+                AbstractConv2d_gradWeights,
+                AbstractConv2d_gradInputs], 'fast_compile')
+def local_lift_abstractconv2d_graph(op, context_name, inputs, outputs):
+    inps = list(inputs)
+    inps[0] = as_gpuarray_variable(inputs[0],
+                                   context_name=context_name)
+    inps[1] = as_gpuarray_variable(inputs[1],
+                                   context_name=context_name)
+    return [op(*inps)]
+
 # Register this here so that it goes after the abstract lifting
 register_opt('fast_compile')(conv_groupopt)