Made suggested review changes; fixed Travis CI failures

4ebe109a · sentient07 · 643f5b1e · 4ebe109a · 4ebe109a · 4ebe109a
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -870,7 +870,7 @@ class GpuAlloc(HideC, Alloc):
        return True


-#Caching GPUAlloc 
+# Caching GpuAlloc
 def gpu_alloc(ctx, memset_0=False):
    key = (ctx, memset_0)
    if key not in gpu_alloc.cache:

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -968,7 +968,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
-                          conv_mode=conv_mode, precision=precision)(kerns.shape)
+                             conv_mode=conv_mode, precision=precision)(kerns.shape)
    desc_op = desc.owner.op
    # We can use Shape_i and bypass the infer_shape here as this is on
    # the input of node and it will always be present.
@@ -990,7 +990,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
    topgrad = gpu_contiguous(topgrad)
    kerns_shp = as_tensor_variable(kerns_shp)
    desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
-                          conv_mode=conv_mode)(kerns_shp)
+                             conv_mode=conv_mode)(kerns_shp)
    out = gpu_alloc_empty(img.dtype, ctx_name)(*kerns_shp)
    return gpu_dnn_conv_gradW()(img, topgrad, out, desc)

@@ -1004,7 +1004,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
    topgrad = gpu_contiguous(topgrad)
    img_shp = as_tensor_variable(img_shp)
    desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
-                          conv_mode=conv_mode)(kerns.shape)
+                             conv_mode=conv_mode)(kerns.shape)
    out = gpu_alloc_empty(kerns.dtype, ctx_name)(*img_shp)
    return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)

@@ -1427,10 +1427,10 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
        return Apply(self, [dy, sm], [sm.type()])


-@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
-                  AbstractConv2d_gradInputs])
+@op_lifter([AbstractConv2d, AbstractConv2d_gradWeights,
+            AbstractConv2d_gradInputs])
 @register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
-                AbstractConv2d_gradInputs], 'fast_compile')
+                AbstractConv2d_gradInputs], 'conv_dnn', 'cudnn', 'gpuarray', 'fast_compile')
 def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
    if (not isinstance(op, (AbstractConv2d,
                            AbstractConv2d_gradWeights,

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
@@ -474,4 +474,3 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 def use_gpu_images2neibs(op, context_name, inputs):
    if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
        return GpuImages2Neibs(op.mode)
-
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -285,9 +285,9 @@ class GraphToGPU(NavigatorOptimizer):
        for i in fgraph.inputs:
            # Do not move *int* scalar to the GPU.
            target = getattr(i.tag, 'target', None)
-            if (target != 'cpu' and 
-                isinstance(i.type, tensor.TensorType) and
-                (i.ndim > 0 or 'int' not in i.dtype)):
+            if (target != 'cpu' and
+               isinstance(i.type, tensor.TensorType) and
+               (i.ndim > 0 or 'int' not in i.dtype)):
                mapping[i] = as_gpuarray_variable(i, target)
            else:
                mapping[i] = i
@@ -305,12 +305,12 @@ class GraphToGPU(NavigatorOptimizer):
        for node in topo:

            if isinstance(node.op, HostFromGpu):
-                mapping[node.outputs[0]] = node.inputs[0]
+                mapping[node.outputs[0]] = mapping[node.inputs[0]]
                continue

            # Move only if any of the inputs are on the GPU.
            move_to_GPU = False
-            from .type import GpuArrayVariable, GpuArraySharedVariable #when you uncomment
+            from .type import GpuArrayVariable, GpuArraySharedVariable
            if any([isinstance(i, GpuArrayVariable) or
                   isinstance(i, GpuArraySharedVariable)
                   for i in [mapping[v] for v in node.inputs] +
@@ -364,15 +364,9 @@ class GraphToGPU(NavigatorOptimizer):
            elif isinstance(new_ops, (tuple, list)):
                outputs = []
                for o in new_ops:
-                    if o.owner and isinstance(o.owner.op, HostFromGpu):
-                        outputs.append(o.owner.inputs[0])
-                    else:
-                        outputs.append(o)
+                    outputs.append(o)
            elif isinstance(new_ops, theano.Variable):
-                if new_ops.owner and isinstance(new_ops.owner.op, HostFromGpu):
-                    outputs = new_ops.owner.inputs
-                else:
-                    outputs = [new_ops]
+                outputs = [new_ops]
            else:
                outputs = new_ops(*[mapping[i] for i in node.inputs],
                                  return_list=True)
@@ -427,9 +421,9 @@ class GraphToGPU(NavigatorOptimizer):
        for s in list(set(old_not_transferred)):
            print(blanc, 'Nodes not transferred by old opt : ' + str(s), file=stream)
        for n in list(set(new_not_transferred)):
-            print(blanc, 'Nodes not transferred by new optimizer : ' +str(n), file=stream)
+            print(blanc, 'Nodes not transferred by new optimizer : ' + str(n), file=stream)
        for d in list(set(set(new_not_transferred) - set(old_not_transferred))):
-            print(blanc, 'Not transferred difference : ' , str(d), file=stream)
+            print(blanc, 'Not transferred difference : ', str(d), file=stream)

        for o, count in iteritems(process_count):
            if count > 0:
@@ -592,7 +586,7 @@ def local_gpuaallocempty(op, context_name, inputs, outputs):
    # We use _props_dict() to make sure that the GPU op know all the
    # CPU op props.
    dtype = op._props_dict().get('dtype')
-    return gpu_alloc_empty(dtype,context_name)(*inputs)
+    return gpu_alloc_empty(dtype, context_name)(*inputs)


 @register_opt()
@@ -614,7 +608,7 @@ def local_gpua_alloc_empty_to_zeros(node):
        context_name = infer_context_name(*node.inputs)
        z = numpy.asarray(0, dtype=node.outputs[0].dtype)
        return [gpu_alloc(None)(as_gpuarray_variable(z, context_name),
-                           *node.inputs)]
+                                *node.inputs)]
 optdb.register('local_gpua_alloc_empty_to_zeros',
               theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
               # After move to gpu and merge2, before inplace.
@@ -889,11 +883,12 @@ def local_gpua_join(op, context_name, inputs, outputs):

 @register_opt('fast_compile')
 @local_optimizer([GpuJoin])
-def local_gpuajoin_1(node):
+@register_opt2([GpuJoin], 'fast_compile')
+def local_gpuajoin_1(op, context_name, inputs, outputs):
    # join of a single element
-    if (isinstance(node.op, GpuJoin) and
-            len(node.inputs) == 2):
-        return [node.inputs[1]]
+    if (isinstance(op, GpuJoin) and
+            len(inputs) == 2):
+        return [inputs[1]]


 @register_opt('fast_compile')
@@ -1311,7 +1306,7 @@ def local_lift_abstractconv2d(op, context_name, inputs, outputs):
 register_opt('fast_compile')(conv_groupopt)


-@register_opt("low_memory")
+@register_opt("low_memory", 'fast_compile')
 @local_optimizer([GpuCAReduceCuda])
 def local_gpu_elemwise_careduce(node):
    """

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -1553,7 +1553,7 @@ class MRG_RandomStreams(object):


 @register_opt2([mrg_uniform], 'fast_compile')
-def local_gpua_mrg(op, context_name, inputs, outputs):
+def local_gpua_mrg1(op, context_name, inputs, outputs):
    if (type(op) == mrg_uniform and
            isinstance(inputs[0].type, GpuArrayType)):
        outs = GPUA_mrg_uniform.new(inputs[0],