Commit 1d8e6f26 authored by sentient07

Clean up and cached GpuCAReduceCuda

Parent 4ebe109a
@@ -1427,8 +1427,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
         return Apply(self, [dy, sm], [sm.type()])
 
 
-@op_lifter([AbstractConv2d, AbstractConv2d_gradWeights,
-            AbstractConv2d_gradInputs])
 @register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
                 AbstractConv2d_gradInputs], 'conv_dnn', 'cudnn', 'gpuarray', 'fast_compile')
 def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
@@ -1481,10 +1479,10 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
                  AbstractConv2d_gradInputs])
 def local_abstractconv_cudnn(node):
     ctx = infer_context_name(*node.inputs)
-    return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs)
+    return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
 
 conv_groupopt.register('local_abstractconv_cudnn_graph',
-                       local_abstractconv_cudnn_graph, 20,
+                       local_abstractconv_cudnn, 20,
                        'fast_compile', 'fast_run',
                        'gpuarray', 'conv_dnn', 'cudnn')
......
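Why the registration change above matters: `conv_groupopt.register` installs a node-based local optimizer, so the node wrapper `local_abstractconv_cudnn` has to be registered rather than `local_abstractconv_cudnn_graph`, which uses the `(op, context_name, inputs, outputs)` signature of the `register_opt2` machinery (the wrapper now also forwards `node.outputs`). A minimal sketch of the two calling conventions, with hypothetical toy names rather than Theano's actual dispatch code:

# Hypothetical illustration of the two optimizer signatures in this diff.
def graph_style(op, context_name, inputs, outputs):
    # register_opt2-style: receives the node's pieces already unpacked.
    return ['replacement graph built from op and inputs']

def node_style(node):
    # local_optimizer-style: receives the Apply node itself and unpacks it,
    # mirroring local_abstractconv_cudnn above.
    return graph_style(node.op, 'ctx', node.inputs, node.outputs)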
@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
         return kernels
 
 
+# Caching GpuCAReduceCuda
+def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None, acc_dtype=None,
+                       pre_scalar_op=None):
+    key = (scalar_op, axis, reduce_mask, dtype, acc_dtype,
+           pre_scalar_op)
+    if key not in gpu_ca_reduce_cuda.cache:
+        gpu_ca_reduce_cuda.cache[key] = GpuCAReduceCuda(scalar_op, axis, reduce_mask, dtype,
+                                                        acc_dtype, pre_scalar_op)
+    return gpu_ca_reduce_cuda.cache[key]
+gpu_ca_reduce_cuda.cache = {}
+
+
 class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
     """
     CAReduce that reuse the python code from gpuarray.
......
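The new `gpu_ca_reduce_cuda` factory memoizes `GpuCAReduceCuda` instances by their constructor arguments: every call site asking for the same reduction gets back the identical op object, so equal ops are never rebuilt and can be compared by identity during optimization. A standalone sketch of the same memoized-factory pattern, using a toy class (hypothetical names, not Theano's):

# Toy stand-in for GpuCAReduceCuda; only hashable arguments may go in the key.
class ToyReduce(object):
    def __init__(self, scalar_op, axis=None, dtype=None):
        self.scalar_op = scalar_op
        self.axis = axis
        self.dtype = dtype

def toy_reduce(scalar_op, axis=None, dtype=None):
    # Build each distinct parameter combination exactly once.
    key = (scalar_op, axis, dtype)
    if key not in toy_reduce.cache:
        toy_reduce.cache[key] = ToyReduce(scalar_op, axis, dtype)
    return toy_reduce.cache[key]
toy_reduce.cache = {}

assert toy_reduce('add', axis=(0,)) is toy_reduce('add', axis=(0,))
assert toy_reduce('add', axis=(0,)) is not toy_reduce('max', axis=(0,))

One caveat of attaching the cache to the function object this way: entries live for the lifetime of the process unless the dict is cleared explicitly.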
@@ -30,7 +30,7 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
 from theano.tests.breakpoint import PdbBreakpoint
 from .type import (GpuArrayType, GpuArrayConstant, get_context,
-                   ContextNotDefined)
+                   ContextNotDefined, GpuArrayVariable, GpuArraySharedVariable)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                         host_from_gpu, GpuToGpu,
                         HostFromGpu, GpuFromHost,
@@ -48,7 +48,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
                    gpu_softmax_with_bias, gpu_softmax)
 from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
-                       GpuCAReduceCPY)
+                       GpuCAReduceCPY, gpu_ca_reduce_cuda)
 from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                         GpuAdvancedSubtensor1,
                         GpuAdvancedIncSubtensor1,
@@ -310,7 +310,6 @@ class GraphToGPU(NavigatorOptimizer):
             # Move only if any of the inputs are on the GPU.
             move_to_GPU = False
-            from .type import GpuArrayVariable, GpuArraySharedVariable
             if any([isinstance(i, GpuArrayVariable) or
                     isinstance(i, GpuArraySharedVariable)
                     for i in [mapping[v] for v in node.inputs] +
@@ -883,12 +882,11 @@ def local_gpua_join(op, context_name, inputs, outputs):
 
 @register_opt('fast_compile')
 @local_optimizer([GpuJoin])
-@register_opt2([GpuJoin], 'fast_compile')
-def local_gpuajoin_1(op, context_name, inputs, outputs):
+def local_gpuajoin_1(node):
     # join of a single element
-    if (isinstance(op, GpuJoin) and
-            len(inputs) == 2):
-        return [inputs[1]]
+    if (isinstance(node.op, GpuJoin) and
+            len(node.inputs) == 2):
+        return [node.inputs[1]]
 
 
 @register_opt('fast_compile')
@@ -1306,7 +1304,7 @@ def local_lift_abstractconv2d(op, context_name, inputs, outputs):
 register_opt('fast_compile')(conv_groupopt)
 
 
-@register_opt("low_memory", 'fast_compile')
+@register_opt("low_memory")
 @local_optimizer([GpuCAReduceCuda])
 def local_gpu_elemwise_careduce(node):
     """
@@ -1324,7 +1322,7 @@ def local_gpu_elemwise_careduce(node):
             isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
         op = node.op
         inp = node.inputs[0].owner.inputs[0]
-        return [GpuCAReduceCuda(scalar_op=op.scalar_op,
-                                axis=op.axis,
-                                reduce_mask=op.reduce_mask,
-                                pre_scalar_op=scalar.basic.sqr)(inp)]
+        return [gpu_ca_reduce_cuda(scalar_op=op.scalar_op,
+                                   axis=op.axis,
+                                   reduce_mask=op.reduce_mask,
+                                   pre_scalar_op=scalar.basic.sqr)(inp)]
......
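With `local_gpu_elemwise_careduce` now registered only under the 'low_memory' tag, a reduction of a squared tensor can fold the squaring into the reduction kernel via `pre_scalar_op`, avoiding a materialized squared intermediate. A hedged usage sketch, assuming a configured `theano.gpuarray` backend (e.g. device=cuda):

import theano
import theano.tensor as T

x = T.fmatrix('x')
y = (x ** 2).sum()  # Elemwise sqr feeding a CAReduce add

# Including 'low_memory' lets the optimizer above rewrite the GPU graph so
# the square is computed inside the reduction (pre_scalar_op=sqr).
mode = theano.compile.get_default_mode().including('low_memory')
f = theano.function([x], y, mode=mode)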