提交 e01db583 authored 作者: sentient07's avatar sentient07

Cache GpuAllocEmpty

上级 a90c7e81
...@@ -31,7 +31,9 @@ from .elemwise import GpuElemwise ...@@ -31,7 +31,9 @@ from .elemwise import GpuElemwise
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter, register_opt2 from .opt import (gpu_seqopt, register_opt, conv_groupopt,
op_lifter, register_opt2, gpu_alloc_empty)
from .opt_util import alpha_merge, output_merge, inplace_allocempty from .opt_util import alpha_merge, output_merge, inplace_allocempty
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
...@@ -896,7 +898,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -896,7 +898,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype, ctx_name)( out = gpu_alloc_empty(img.dtype, ctx_name)(
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
...@@ -914,7 +916,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -914,7 +916,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph), out = gpu_alloc_empty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3) shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
...@@ -932,7 +934,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -932,7 +934,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp) out = gpu_alloc_empty(img.dtype, ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
...@@ -946,7 +948,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', ...@@ -946,7 +948,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
kerns_shp = as_tensor_variable(kerns_shp) kerns_shp = as_tensor_variable(kerns_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp) conv_mode=conv_mode)(kerns_shp)
out = GpuAllocEmpty(img.dtype, ctx_name)(*kerns_shp) out = gpu_alloc_empty(img.dtype, ctx_name)(*kerns_shp)
return GpuDnnConvGradW()(img, topgrad, out, desc) return GpuDnnConvGradW()(img, topgrad, out, desc)
...@@ -960,7 +962,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', ...@@ -960,7 +962,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
img_shp = as_tensor_variable(img_shp) img_shp = as_tensor_variable(img_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape) conv_mode=conv_mode)(kerns.shape)
out = GpuAllocEmpty(kerns.dtype, ctx_name)(*img_shp) out = gpu_alloc_empty(kerns.dtype, ctx_name)(*img_shp)
return GpuDnnConvGradI()(kerns, topgrad, out, desc) return GpuDnnConvGradI()(kerns, topgrad, out, desc)
......
...@@ -452,7 +452,7 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -452,7 +452,7 @@ class GpuCumsum(GpuKernelBase, Op):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
#@register_opt2([CumsumOp], 'fast_compile') @register_opt2([CumsumOp], 'fast_compile')
def use_gpu_cumsumop(op, ctx_name, inputs, ): def use_gpu_cumsumop(op, ctx_name, inputs, ):
if inputs[0].dtype == 'float32': if inputs[0].dtype == 'float32':
axis = op.axis axis = op.axis
...@@ -471,6 +471,3 @@ def use_gpu_cumsumop(op, ctx_name, inputs, ): ...@@ -471,6 +471,3 @@ def use_gpu_cumsumop(op, ctx_name, inputs, ):
axis = 0 axis = 0
return GpuCumsum(axis)(x) return GpuCumsum(axis)(x)
#register_opt('fast_compile')(use_gpu_cumsumop)
#
\ No newline at end of file
...@@ -51,6 +51,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -51,6 +51,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -59,6 +61,14 @@ gpu_optimizer2 = EquilibriumDB() ...@@ -59,6 +61,14 @@ gpu_optimizer2 = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
def gpu_alloc_empty(dtype, ctx):
    """Return a memoized ``GpuAllocEmpty`` op for ``(dtype, ctx)``.

    Constructing the op is done at most once per distinct
    ``(dtype, ctx)`` pair; subsequent calls return the cached
    instance from ``gpu_alloc_empty.cache``.
    """
    cache = gpu_alloc_empty.cache
    key = (dtype, ctx)
    op = cache.get(key)
    if op is None:
        op = GpuAllocEmpty(dtype, ctx)
        cache[key] = op
    return op
# Per-(dtype, ctx) memo table used by gpu_alloc_empty above.
gpu_alloc_empty.cache = {}
class GraphToGPUDB(DB): class GraphToGPUDB(DB):
""" """
Retrieves the list local optimizers based on the optimizer flag's value Retrieves the list local optimizers based on the optimizer flag's value
...@@ -292,7 +302,7 @@ class GraphToGPU(Optimizer): ...@@ -292,7 +302,7 @@ class GraphToGPU(Optimizer):
move_to_GPU = True move_to_GPU = True
''' '''
out_clients = [o.clients for o in node.outputs] out_clients = [o.clients for o in node.outputs]
context_name = None context_name = None
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论