提交 e01db583 authored 作者: sentient07's avatar sentient07

Cache GpuAllocEmpty

上级 a90c7e81
...@@ -31,7 +31,9 @@ from .elemwise import GpuElemwise ...@@ -31,7 +31,9 @@ from .elemwise import GpuElemwise
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter, register_opt2 from .opt import (gpu_seqopt, register_opt, conv_groupopt,
op_lifter, register_opt2, gpu_alloc_empty)
from .opt_util import alpha_merge, output_merge, inplace_allocempty from .opt_util import alpha_merge, output_merge, inplace_allocempty
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
...@@ -896,7 +898,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -896,7 +898,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype, ctx_name)( out = gpu_alloc_empty(img.dtype, ctx_name)(
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
...@@ -914,7 +916,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -914,7 +916,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph), out = gpu_alloc_empty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3) shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
...@@ -932,7 +934,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -932,7 +934,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp) out = gpu_alloc_empty(img.dtype, ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
...@@ -946,7 +948,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', ...@@ -946,7 +948,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
kerns_shp = as_tensor_variable(kerns_shp) kerns_shp = as_tensor_variable(kerns_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp) conv_mode=conv_mode)(kerns_shp)
out = GpuAllocEmpty(img.dtype, ctx_name)(*kerns_shp) out = gpu_alloc_empty(img.dtype, ctx_name)(*kerns_shp)
return GpuDnnConvGradW()(img, topgrad, out, desc) return GpuDnnConvGradW()(img, topgrad, out, desc)
...@@ -960,7 +962,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', ...@@ -960,7 +962,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
img_shp = as_tensor_variable(img_shp) img_shp = as_tensor_variable(img_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape) conv_mode=conv_mode)(kerns.shape)
out = GpuAllocEmpty(kerns.dtype, ctx_name)(*img_shp) out = gpu_alloc_empty(kerns.dtype, ctx_name)(*img_shp)
return GpuDnnConvGradI()(kerns, topgrad, out, desc) return GpuDnnConvGradI()(kerns, topgrad, out, desc)
......
...@@ -452,7 +452,7 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -452,7 +452,7 @@ class GpuCumsum(GpuKernelBase, Op):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
#@register_opt2([CumsumOp], 'fast_compile') @register_opt2([CumsumOp], 'fast_compile')
def use_gpu_cumsumop(op, ctx_name, inputs, ): def use_gpu_cumsumop(op, ctx_name, inputs, ):
if inputs[0].dtype == 'float32': if inputs[0].dtype == 'float32':
axis = op.axis axis = op.axis
...@@ -471,6 +471,3 @@ def use_gpu_cumsumop(op, ctx_name, inputs, ): ...@@ -471,6 +471,3 @@ def use_gpu_cumsumop(op, ctx_name, inputs, ):
axis = 0 axis = 0
return GpuCumsum(axis)(x) return GpuCumsum(axis)(x)
#register_opt('fast_compile')(use_gpu_cumsumop)
#
\ No newline at end of file
...@@ -51,6 +51,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -51,6 +51,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -59,6 +61,14 @@ gpu_optimizer2 = EquilibriumDB() ...@@ -59,6 +61,14 @@ gpu_optimizer2 = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
def gpu_alloc_empty(dtype, ctx):
    """Return a memoized ``GpuAllocEmpty`` op for ``(dtype, ctx)``.

    Constructing the op is done at most once per distinct
    ``(dtype, ctx)`` pair; subsequent calls return the cached
    instance from ``gpu_alloc_empty.cache``.
    """
    cache = gpu_alloc_empty.cache
    key = (dtype, ctx)
    op = cache.get(key)
    if op is None:
        op = GpuAllocEmpty(dtype, ctx)
        cache[key] = op
    return op
# Per-(dtype, ctx) memo table used by gpu_alloc_empty above.
gpu_alloc_empty.cache = {}
class GraphToGPUDB(DB): class GraphToGPUDB(DB):
""" """
Retrieves the list local optimizers based on the optimizer flag's value Retrieves the list local optimizers based on the optimizer flag's value
...@@ -292,7 +302,7 @@ class GraphToGPU(Optimizer): ...@@ -292,7 +302,7 @@ class GraphToGPU(Optimizer):
move_to_GPU = True move_to_GPU = True
''' '''
out_clients = [o.clients for o in node.outputs] out_clients = [o.clients for o in node.outputs]
context_name = None context_name = None
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论