Made suggested changes, fixed Travis CI build

4ebe109a · sentient07 · 643f5b1e · 4ebe109a · 4ebe109a · 4ebe109a
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -870,7 +870,7 @@ class GpuAlloc(HideC, Alloc):
        return True
-#Caching GPUAlloc 
+# Caching GPUAlloc
 def gpu_alloc(ctx, memset_0=False):
    key = (ctx, memset_0)
    if key not in gpu_alloc.cache:

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -1427,10 +1427,10 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
        return Apply(self, [dy, sm], [sm.type()])
-@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
+@op_lifter([AbstractConv2d, AbstractConv2d_gradWeights,
            AbstractConv2d_gradInputs])
 @register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
-                AbstractConv2d_gradInputs], 'fast_compile')
+                AbstractConv2d_gradInputs], 'conv_dnn', 'cudnn', 'gpuarray', 'fast_compile')
 def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
    if (not isinstance(op, (AbstractConv2d,
                            AbstractConv2d_gradWeights,

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
@@ -474,4 +474,3 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 def use_gpu_images2neibs(op, context_name, inputs):
    if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
        return GpuImages2Neibs(op.mode)
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -305,12 +305,12 @@ class GraphToGPU(NavigatorOptimizer):
        for node in topo:
            if isinstance(node.op, HostFromGpu):
-                mapping[node.outputs[0]] = node.inputs[0]
+                mapping[node.outputs[0]] = mapping[node.inputs[0]]
                continue
            # Move only if any of the inputs are on the GPU.
            move_to_GPU = False
-            from .type import GpuArrayVariable, GpuArraySharedVariable #when you uncomment
+            from .type import GpuArrayVariable, GpuArraySharedVariable
            if any([isinstance(i, GpuArrayVariable) or
                   isinstance(i, GpuArraySharedVariable)
                   for i in [mapping[v] for v in node.inputs] +
@@ -364,14 +364,8 @@ class GraphToGPU(NavigatorOptimizer):
            elif isinstance(new_ops, (tuple, list)):
                outputs = []
                for o in new_ops:
-                    if o.owner and isinstance(o.owner.op, HostFromGpu):
-                        outputs.append(o.owner.inputs[0])
-                    else:
                    outputs.append(o)
            elif isinstance(new_ops, theano.Variable):
-                if new_ops.owner and isinstance(new_ops.owner.op, HostFromGpu):
-                    outputs = new_ops.owner.inputs
-                else:
                outputs = [new_ops]
            else:
                outputs = new_ops(*[mapping[i] for i in node.inputs],
@@ -427,9 +421,9 @@ class GraphToGPU(NavigatorOptimizer):
        for s in list(set(old_not_transferred)):
            print(blanc, 'Nodes not transferred by old opt : ' + str(s), file=stream)
        for n in list(set(new_not_transferred)):
-            print(blanc, 'Nodes not transferred by new optimizer : ' +str(n), file=stream)
+            print(blanc, 'Nodes not transferred by new optimizer : ' + str(n), file=stream)
        for d in list(set(set(new_not_transferred) - set(old_not_transferred))):
-            print(blanc, 'Not transferred difference : ' , str(d), file=stream)
+            print(blanc, 'Not transferred difference : ', str(d), file=stream)
        for o, count in iteritems(process_count):
            if count > 0:
@@ -592,7 +586,7 @@ def local_gpuaallocempty(op, context_name, inputs, outputs):
    # We use _props_dict() to make sure that the GPU op know all the
    # CPU op props.
    dtype = op._props_dict().get('dtype')
-    return gpu_alloc_empty(dtype,context_name)(*inputs)
+    return gpu_alloc_empty(dtype, context_name)(*inputs)
 @register_opt()
@@ -889,11 +883,12 @@ def local_gpua_join(op, context_name, inputs, outputs):
 @register_opt('fast_compile')
 @local_optimizer([GpuJoin])
-def local_gpuajoin_1(node):
+@register_opt2([GpuJoin], 'fast_compile')
+def local_gpuajoin_1(op, context_name, inputs, outputs):
    # join of a single element
-    if (isinstance(node.op, GpuJoin) and
+    if (isinstance(op, GpuJoin) and
-            len(node.inputs) == 2):
+            len(inputs) == 2):
-        return [node.inputs[1]]
+        return [inputs[1]]
 @register_opt('fast_compile')
@@ -1311,7 +1306,7 @@ def local_lift_abstractconv2d(op, context_name, inputs, outputs):
 register_opt('fast_compile')(conv_groupopt)
-@register_opt("low_memory")
+@register_opt("low_memory", 'fast_compile')
 @local_optimizer([GpuCAReduceCuda])
 def local_gpu_elemwise_careduce(node):
    """

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -1553,7 +1553,7 @@ class MRG_RandomStreams(object):
 @register_opt2([mrg_uniform], 'fast_compile')
-def local_gpua_mrg(op, context_name, inputs, outputs):
+def local_gpua_mrg1(op, context_name, inputs, outputs):
    if (type(op) == mrg_uniform and
            isinstance(inputs[0].type, GpuArrayType)):
        outs = GPUA_mrg_uniform.new(inputs[0],