提交 6701568f authored 作者: sentient07's avatar sentient07

Changed the interface of gpu_alloc_empty and made a few cleanups

上级 8f602f6f
...@@ -973,7 +973,22 @@ def empty_like(var): ...@@ -973,7 +973,22 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape) return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(dtype, ctx): def gpu_alloc_empty(ctx, **kwargs):
'''
This is the caching helper for the GpuAllocEmpty class.
It takes the context name and the Op's props_dict as keyword arguments,
and retrieves the dtype from the ``dtype`` key of that dictionary.
Parameters
----------
ctx : String
The context name.
kwargs : Dict
The props_dict of the Op
'''
dtype = kwargs.get('dtype')
key = (dtype, ctx) key = (dtype, ctx)
if key not in gpu_alloc_empty.cache: if key not in gpu_alloc_empty.cache:
gpu_alloc_empty.cache[key] = GpuAllocEmpty(dtype, ctx) gpu_alloc_empty.cache[key] = GpuAllocEmpty(dtype, ctx)
......
...@@ -937,7 +937,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -937,7 +937,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc_empty(img.dtype, ctx_name)( out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
...@@ -955,7 +955,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -955,7 +955,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc_empty(img.dtype, ctx_name)(shape_i(img, 0, fgraph), out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3) shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
...@@ -977,7 +977,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -977,7 +977,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = get_conv_output_shape(ishape, kshape, out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out = gpu_alloc_empty(img.dtype, ctx_name)(*out_shp) out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc) return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
...@@ -991,7 +991,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', ...@@ -991,7 +991,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
kerns_shp = as_tensor_variable(kerns_shp) kerns_shp = as_tensor_variable(kerns_shp)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample, desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp) conv_mode=conv_mode)(kerns_shp)
out = gpu_alloc_empty(img.dtype, ctx_name)(*kerns_shp) out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*kerns_shp)
return gpu_dnn_conv_gradW()(img, topgrad, out, desc) return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
...@@ -1005,7 +1005,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', ...@@ -1005,7 +1005,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
img_shp = as_tensor_variable(img_shp) img_shp = as_tensor_variable(img_shp)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample, desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape) conv_mode=conv_mode)(kerns.shape)
out = gpu_alloc_empty(kerns.dtype, ctx_name)(*img_shp) out = gpu_alloc_empty(ctx_name, kerns.dtype)(*img_shp)
return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc) return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
...@@ -1480,7 +1480,7 @@ def local_abstractconv_cudnn(node): ...@@ -1480,7 +1480,7 @@ def local_abstractconv_cudnn(node):
return return
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
conv_groupopt.register('local_abstractconv_cudnn_graph', conv_groupopt.register('local_abstractconv_cudnn',
local_abstractconv_cudnn, 20, local_abstractconv_cudnn, 20,
'fast_compile', 'fast_run', 'fast_compile', 'fast_run',
'gpuarray', 'conv_dnn', 'cudnn') 'gpuarray', 'conv_dnn', 'cudnn')
...@@ -1549,7 +1549,7 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1549,7 +1549,7 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([Pool]) @op_lifter([Pool])
@register_opt2([Pool], 'fast_compile') @register_opt2([Pool], 'fast_compile', 'cudnn')
def local_pool_dnn_alternative(op, ctx_name, inputs, outputs): def local_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
...@@ -1566,7 +1566,7 @@ def local_pool_dnn_alternative(op, ctx_name, inputs, outputs): ...@@ -1566,7 +1566,7 @@ def local_pool_dnn_alternative(op, ctx_name, inputs, outputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
@register_opt2([MaxPoolGrad], 'fast_compile') @register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
...@@ -1591,7 +1591,7 @@ def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): ...@@ -1591,7 +1591,7 @@ def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
@register_opt2([AveragePoolGrad], 'fast_compile') @register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
...@@ -1645,7 +1645,7 @@ def local_log_softmax_dnn(node): ...@@ -1645,7 +1645,7 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], 'fast_compile') @register_opt2([LogSoftmax], 'fast_compile', 'cudnn')
def local_logsoftmax_to_dnn(op, ctx_name, inputs, outputs): def local_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = inputs[0] inp = inputs[0]
......
...@@ -3,6 +3,7 @@ import os ...@@ -3,6 +3,7 @@ import os
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp from theano.tensor.extra_ops import CumsumOp
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import infer_context_name
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
...@@ -40,6 +41,9 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -40,6 +41,9 @@ class GpuCumsum(GpuKernelBase, Op):
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS: if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\ raise NotImplementedError('Only cumsum on 1D, 2D and\
3D arrays are supported right now!') 3D arrays are supported right now!')
...@@ -467,5 +471,5 @@ def use_gpu_cumsumop(op, ctx_name, inputs, outputs): ...@@ -467,5 +471,5 @@ def use_gpu_cumsumop(op, ctx_name, inputs, outputs):
# ``gpu_cumsum`` assume array has been flattened if needed. # ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None: if axis is None:
axis = 0 axis = 0
assert isinstance(x.type, GpuArrayType)
return GpuCumsum(axis)(x) return GpuCumsum(axis)(x)
...@@ -243,9 +243,6 @@ def local_gpua_multinomial(op, context_name, inputs, outputs): ...@@ -243,9 +243,6 @@ def local_gpua_multinomial(op, context_name, inputs, outputs):
return None return None
except NotScalarConstantError: except NotScalarConstantError:
return None return None
node = op.make_node(*inputs)
outputs = node.outputs
m, = outputs m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'): if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(op.odtype) gpu_op = GPUAMultinomialFromUniform(op.odtype)
......
...@@ -158,7 +158,7 @@ def local_dot_to_gemm16(op, ctx_name, inputs, outputs): ...@@ -158,7 +158,7 @@ def local_dot_to_gemm16(op, ctx_name, inputs, outputs):
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = inputs[0].fgraph fgraph = inputs[0].fgraph
C = gpu_alloc_empty(dtype='float16', context_name=ctx_name)( C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
...@@ -59,11 +59,10 @@ _logger = logging.getLogger("theano.gpuarray.opt") ...@@ -59,11 +59,10 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_optimizer2 = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
old_not_transferred = [] # Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
new_not_transferred = [] gpu_optimizer2 = EquilibriumDB()
class GraphToGPUDB(DB): class GraphToGPUDB(DB):
...@@ -207,8 +206,7 @@ def op_lifter(OP, cuda_only=False): ...@@ -207,8 +206,7 @@ def op_lifter(OP, cuda_only=False):
i.tag.context_name = context_name i.tag.context_name = context_name
new_op = maker(node.op, context_name, node.inputs, node.outputs) new_op = maker(node.op, context_name, node.inputs, node.outputs)
if not new_op:
old_not_transferred.append(node)
# This is needed as sometimes new_op inherits from OP. # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
...@@ -375,8 +373,6 @@ class GraphToGPU(NavigatorOptimizer): ...@@ -375,8 +373,6 @@ class GraphToGPU(NavigatorOptimizer):
if not new_ops: if not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i) newnode = node.clone_with_new_inputs([mapping.get(i)
for i in node.inputs]) for i in node.inputs])
new_not_transferred.append(newnode)
outputs = newnode.outputs outputs = newnode.outputs
elif isinstance(new_ops, (tuple, list)): elif isinstance(new_ops, (tuple, list)):
outputs = [] outputs = []
...@@ -596,7 +592,7 @@ def local_gpuaallocempty(op, context_name, inputs, outputs): ...@@ -596,7 +592,7 @@ def local_gpuaallocempty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
dtype = op._props_dict().get('dtype') dtype = op._props_dict().get('dtype')
return gpu_alloc_empty(dtype, context_name)(*inputs) return gpu_alloc_empty(context_name, dtype=dtype)(*inputs)
@register_opt() @register_opt()
...@@ -921,17 +917,14 @@ def local_gpua_subtensor(op, context_name, inputs, outputs): ...@@ -921,17 +917,14 @@ def local_gpua_subtensor(op, context_name, inputs, outputs):
isinstance(gpu_x.owner.op, GpuFromHost) and isinstance(gpu_x.owner.op, GpuFromHost) and
# And it is a shared var or an input of the graph. # And it is a shared var or an input of the graph.
not gpu_x.owner.inputs[0].owner): not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1 and len(outputs[0].clients) == 1: if len(x.clients) == 1:
return if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
# Here is the condition for the GraphToGPU opt. inputs is the for v in n.inputs + n.outputs])
# inputs we want to use for the new node for n, _ in outputs[0].clients]):
if (x.owner and isinstance(x.owner.op, GpuFromHost)): return
cpu_x = x.owner.inputs[0] else:
# And it is a shared var or an input of the graph. return [host_from_gpu(gpu_x.owner.op(outputs[0]))]
# and is used by only 1 node.
# x is in the new graph, so we can't tests its number of clients.
if not cpu_x.owner and len(cpu_x.clients) == 1:
return
return GpuSubtensor(op.idx_list) return GpuSubtensor(op.idx_list)
...@@ -1146,8 +1139,8 @@ def local_gpua_hgemm(op, context_name, inputs, outputs): ...@@ -1146,8 +1139,8 @@ def local_gpua_hgemm(op, context_name, inputs, outputs):
B = inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = inputs[0].fgraph fgraph = outputs[0].fgraph
C = gpu_alloc_empty('float16', context_name)( C = gpu_alloc_empty(context_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(A, 0, fgraph),
shape_i(B, 1, fgraph)) shape_i(B, 1, fgraph))
return gpugemm_no_inplace(C, 1.0, A, B, 0.0) return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
...@@ -1198,7 +1191,7 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs): ...@@ -1198,7 +1191,7 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs x, y, a = inputs
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name) y = as_gpuarray_variable(y, context_name)
z = gpu_alloc_empty(x.dtype, context_name)(x.shape[0], y.shape[1]) z = gpu_alloc_empty(context_name, dtype=x.dtype)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)] return [gpugemm_no_inplace(z, a, x, y, 0)]
...@@ -1298,7 +1291,7 @@ def local_inplace_sparseblockouter(node): ...@@ -1298,7 +1291,7 @@ def local_inplace_sparseblockouter(node):
# This deals with any abstract convs that have a transfer somewhere # This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile', 'conv_dnn') @register_opt('fast_compile')
@op_lifter([AbstractConv2d, @op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs])
......
...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx): ...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1): len(alloc.clients) > 1):
alloc_op = gpu_alloc_empty(alloc.owner.op.dtype, alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype)
alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs) inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs) return maker(node, inputs)
return opt return opt
......
...@@ -24,7 +24,7 @@ from . import multinomial ...@@ -24,7 +24,7 @@ from . import multinomial
import theano.sandbox.cuda import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua, from theano.gpuarray.opt import (register_opt as register_gpua,
...@@ -1567,13 +1567,8 @@ def local_gpua_mrg1(op, context_name, inputs, outputs): ...@@ -1567,13 +1567,8 @@ def local_gpua_mrg1(op, context_name, inputs, outputs):
@local_optimizer([mrg_uniform]) @local_optimizer([mrg_uniform])
def local_gpua_mrg(node): def local_gpua_mrg(node):
# TODO : need description for function # TODO : need description for function
if (type(node.op) == mrg_uniform and context_name = infer_context_name(*node.inputs)
isinstance(node.inputs[0].type, GpuArrayType)): return local_gpua_mrg1(node.op, context_name, node.inputs, node.outputs)
outs = GPUA_mrg_uniform.new(node.inputs[0],
node.op.output_type.ndim,
node.op.output_type.dtype,
node.inputs[1])
return [outs[0], host_from_gpua(outs[1])]
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform) MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论