Commit 6701568f, authored by sentient07

Changed interface of gpu_alloc_empty and few cleanups

Parent commit: 8f602f6f
......@@ -973,7 +973,22 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(dtype, ctx):
def gpu_alloc_empty(ctx, **kwargs):
'''
This is the cache method of GpuAllocEmpty class.
This takes the parameters of context name and props_dict
and retrieves the dtype key from the dictionary
Parameters
----------
ctx : String
The context name.
kwargs : Dict
The props_dict of the Op
'''
dtype = kwargs.get('dtype')
key = (dtype, ctx)
if key not in gpu_alloc_empty.cache:
gpu_alloc_empty.cache[key] = GpuAllocEmpty(dtype, ctx)
......
......@@ -937,7 +937,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc_empty(img.dtype, ctx_name)(
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
......@@ -955,7 +955,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc_empty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
......@@ -977,7 +977,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc_empty(img.dtype, ctx_name)(*out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
......@@ -991,7 +991,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
kerns_shp = as_tensor_variable(kerns_shp)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp)
out = gpu_alloc_empty(img.dtype, ctx_name)(*kerns_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*kerns_shp)
return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
......@@ -1005,7 +1005,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
img_shp = as_tensor_variable(img_shp)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape)
out = gpu_alloc_empty(kerns.dtype, ctx_name)(*img_shp)
out = gpu_alloc_empty(ctx_name, kerns.dtype)(*img_shp)
return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
......@@ -1480,7 +1480,7 @@ def local_abstractconv_cudnn(node):
return
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
conv_groupopt.register('local_abstractconv_cudnn_graph',
conv_groupopt.register('local_abstractconv_cudnn',
local_abstractconv_cudnn, 20,
'fast_compile', 'fast_run',
'gpuarray', 'conv_dnn', 'cudnn')
......@@ -1549,7 +1549,7 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn', 'fast_compile')
@op_lifter([Pool])
@register_opt2([Pool], 'fast_compile')
@register_opt2([Pool], 'fast_compile', 'cudnn')
def local_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -1566,7 +1566,7 @@ def local_pool_dnn_alternative(op, ctx_name, inputs, outputs):
@register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad])
@register_opt2([MaxPoolGrad], 'fast_compile')
@register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -1591,7 +1591,7 @@ def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
@register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad])
@register_opt2([AveragePoolGrad], 'fast_compile')
@register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -1645,7 +1645,7 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], 'fast_compile')
@register_opt2([LogSoftmax], 'fast_compile', 'cudnn')
def local_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
# Transform the input in the format expected by GpuDnnSoftmax
inp = inputs[0]
......
......@@ -3,6 +3,7 @@ import os
from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp
from .type import GpuArrayType
from .basic_ops import infer_context_name
try:
from pygpu import gpuarray
except ImportError:
......@@ -40,6 +41,9 @@ class GpuCumsum(GpuKernelBase, Op):
def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\
3D arrays are supported right now!')
......@@ -467,5 +471,5 @@ def use_gpu_cumsumop(op, ctx_name, inputs, outputs):
# ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None:
axis = 0
assert isinstance(x.type, GpuArrayType)
return GpuCumsum(axis)(x)
......@@ -243,9 +243,6 @@ def local_gpua_multinomial(op, context_name, inputs, outputs):
return None
except NotScalarConstantError:
return None
node = op.make_node(*inputs)
outputs = node.outputs
m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(op.odtype)
......
......@@ -158,7 +158,7 @@ def local_dot_to_gemm16(op, ctx_name, inputs, outputs):
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = inputs[0].fgraph
C = gpu_alloc_empty(dtype='float16', context_name=ctx_name)(
C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0)
......
......@@ -59,11 +59,10 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer = EquilibriumDB()
gpu_optimizer2 = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
old_not_transferred = []
new_not_transferred = []
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()
class GraphToGPUDB(DB):
......@@ -207,8 +206,7 @@ def op_lifter(OP, cuda_only=False):
i.tag.context_name = context_name
new_op = maker(node.op, context_name, node.inputs, node.outputs)
if not new_op:
old_not_transferred.append(node)
# This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
......@@ -375,8 +373,6 @@ class GraphToGPU(NavigatorOptimizer):
if not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i)
for i in node.inputs])
new_not_transferred.append(newnode)
outputs = newnode.outputs
elif isinstance(new_ops, (tuple, list)):
outputs = []
......@@ -596,7 +592,7 @@ def local_gpuaallocempty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the
# CPU op props.
dtype = op._props_dict().get('dtype')
return gpu_alloc_empty(dtype, context_name)(*inputs)
return gpu_alloc_empty(context_name, dtype=dtype)(*inputs)
@register_opt()
......@@ -921,17 +917,14 @@ def local_gpua_subtensor(op, context_name, inputs, outputs):
isinstance(gpu_x.owner.op, GpuFromHost) and
# And it is a shared var or an input of the graph.
not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1 and len(outputs[0].clients) == 1:
return
# Here is the condition for the GraphToGPU opt. inputs is the
# inputs we want to use for the new node
if (x.owner and isinstance(x.owner.op, GpuFromHost)):
cpu_x = x.owner.inputs[0]
# And it is a shared var or an input of the graph.
# and is used by only 1 node.
# x is in the new graph, so we can't tests its number of clients.
if not cpu_x.owner and len(cpu_x.clients) == 1:
return
if len(x.clients) == 1:
if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
for v in n.inputs + n.outputs])
for n, _ in outputs[0].clients]):
return
else:
return [host_from_gpu(gpu_x.owner.op(outputs[0]))]
return GpuSubtensor(op.idx_list)
......@@ -1146,8 +1139,8 @@ def local_gpua_hgemm(op, context_name, inputs, outputs):
B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = inputs[0].fgraph
C = gpu_alloc_empty('float16', context_name)(
fgraph = outputs[0].fgraph
C = gpu_alloc_empty(context_name, dtype='float16')(
shape_i(A, 0, fgraph),
shape_i(B, 1, fgraph))
return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
......@@ -1198,7 +1191,7 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name)
z = gpu_alloc_empty(x.dtype, context_name)(x.shape[0], y.shape[1])
z = gpu_alloc_empty(context_name, dtype=x.dtype)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)]
......@@ -1298,7 +1291,7 @@ def local_inplace_sparseblockouter(node):
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile', 'conv_dnn')
@register_opt('fast_compile')
@op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
......
......@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1):
alloc_op = gpu_alloc_empty(alloc.owner.op.dtype,
alloc.owner.op.context_name)
alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype)
inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs)
return opt
......
......@@ -24,7 +24,7 @@ from . import multinomial
import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua,
......@@ -1567,13 +1567,8 @@ def local_gpua_mrg1(op, context_name, inputs, outputs):
@local_optimizer([mrg_uniform])
def local_gpua_mrg(node):
# TODO : need description for function
if (type(node.op) == mrg_uniform and
isinstance(node.inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(node.inputs[0],
node.op.output_type.ndim,
node.op.output_type.dtype,
node.inputs[1])
return [outs[0], host_from_gpua(outs[1])]
context_name = infer_context_name(*node.inputs)
return local_gpua_mrg1(node.op, context_name, node.inputs, node.outputs)
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment