Commit 6701568f, authored by sentient07

Changed interface of gpu_alloc_empty and few cleanups

Parent commit: 8f602f6f
......@@ -973,7 +973,22 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(dtype, ctx):
def gpu_alloc_empty(ctx, **kwargs):
'''
This is the cache method of GpuAllocEmpty class.
This takes the parameters of context name and props_dict
and retrieves the dtype key from the dictionary
Parameters
----------
ctx : String
The context name.
kwargs : Dict
The props_dict of the Op
'''
dtype = kwargs.get('dtype')
key = (dtype, ctx)
if key not in gpu_alloc_empty.cache:
gpu_alloc_empty.cache[key] = GpuAllocEmpty(dtype, ctx)
......
......@@ -937,7 +937,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc_empty(img.dtype, ctx_name)(
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
......@@ -955,7 +955,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc_empty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
......@@ -977,7 +977,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc_empty(img.dtype, ctx_name)(*out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
......@@ -991,7 +991,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
kerns_shp = as_tensor_variable(kerns_shp)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp)
out = gpu_alloc_empty(img.dtype, ctx_name)(*kerns_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*kerns_shp)
return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
......@@ -1005,7 +1005,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
img_shp = as_tensor_variable(img_shp)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape)
out = gpu_alloc_empty(kerns.dtype, ctx_name)(*img_shp)
out = gpu_alloc_empty(ctx_name, kerns.dtype)(*img_shp)
return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
......@@ -1480,7 +1480,7 @@ def local_abstractconv_cudnn(node):
return
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
conv_groupopt.register('local_abstractconv_cudnn_graph',
conv_groupopt.register('local_abstractconv_cudnn',
local_abstractconv_cudnn, 20,
'fast_compile', 'fast_run',
'gpuarray', 'conv_dnn', 'cudnn')
......@@ -1549,7 +1549,7 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn', 'fast_compile')
@op_lifter([Pool])
@register_opt2([Pool], 'fast_compile')
@register_opt2([Pool], 'fast_compile', 'cudnn')
def local_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -1566,7 +1566,7 @@ def local_pool_dnn_alternative(op, ctx_name, inputs, outputs):
@register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad])
@register_opt2([MaxPoolGrad], 'fast_compile')
@register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -1591,7 +1591,7 @@ def local_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
@register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad])
@register_opt2([AveragePoolGrad], 'fast_compile')
@register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -1645,7 +1645,7 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], 'fast_compile')
@register_opt2([LogSoftmax], 'fast_compile', 'cudnn')
def local_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
# Transform the input in the format expected by GpuDnnSoftmax
inp = inputs[0]
......
......@@ -3,6 +3,7 @@ import os
from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp
from .type import GpuArrayType
from .basic_ops import infer_context_name
try:
from pygpu import gpuarray
except ImportError:
......@@ -40,6 +41,9 @@ class GpuCumsum(GpuKernelBase, Op):
def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\
3D arrays are supported right now!')
......@@ -467,5 +471,5 @@ def use_gpu_cumsumop(op, ctx_name, inputs, outputs):
# ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None:
axis = 0
assert isinstance(x.type, GpuArrayType)
return GpuCumsum(axis)(x)
......@@ -243,9 +243,6 @@ def local_gpua_multinomial(op, context_name, inputs, outputs):
return None
except NotScalarConstantError:
return None
node = op.make_node(*inputs)
outputs = node.outputs
m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(op.odtype)
......
......@@ -158,7 +158,7 @@ def local_dot_to_gemm16(op, ctx_name, inputs, outputs):
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = inputs[0].fgraph
C = gpu_alloc_empty(dtype='float16', context_name=ctx_name)(
C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0)
......
......@@ -59,11 +59,10 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer = EquilibriumDB()
gpu_optimizer2 = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
old_not_transferred = []
new_not_transferred = []
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()
class GraphToGPUDB(DB):
......@@ -207,8 +206,7 @@ def op_lifter(OP, cuda_only=False):
i.tag.context_name = context_name
new_op = maker(node.op, context_name, node.inputs, node.outputs)
if not new_op:
old_not_transferred.append(node)
# This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
......@@ -375,8 +373,6 @@ class GraphToGPU(NavigatorOptimizer):
if not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i)
for i in node.inputs])
new_not_transferred.append(newnode)
outputs = newnode.outputs
elif isinstance(new_ops, (tuple, list)):
outputs = []
......@@ -596,7 +592,7 @@ def local_gpuaallocempty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the
# CPU op props.
dtype = op._props_dict().get('dtype')
return gpu_alloc_empty(dtype, context_name)(*inputs)
return gpu_alloc_empty(context_name, dtype=dtype)(*inputs)
@register_opt()
......@@ -921,17 +917,14 @@ def local_gpua_subtensor(op, context_name, inputs, outputs):
isinstance(gpu_x.owner.op, GpuFromHost) and
# And it is a shared var or an input of the graph.
not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1 and len(outputs[0].clients) == 1:
return
# Here is the condition for the GraphToGPU opt. inputs is the
# inputs we want to use for the new node
if (x.owner and isinstance(x.owner.op, GpuFromHost)):
cpu_x = x.owner.inputs[0]
# And it is a shared var or an input of the graph.
# and is used by only 1 node.
# x is in the new graph, so we can't tests its number of clients.
if not cpu_x.owner and len(cpu_x.clients) == 1:
return
if len(x.clients) == 1:
if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
for v in n.inputs + n.outputs])
for n, _ in outputs[0].clients]):
return
else:
return [host_from_gpu(gpu_x.owner.op(outputs[0]))]
return GpuSubtensor(op.idx_list)
......@@ -1146,8 +1139,8 @@ def local_gpua_hgemm(op, context_name, inputs, outputs):
B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = inputs[0].fgraph
C = gpu_alloc_empty('float16', context_name)(
fgraph = outputs[0].fgraph
C = gpu_alloc_empty(context_name, dtype='float16')(
shape_i(A, 0, fgraph),
shape_i(B, 1, fgraph))
return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
......@@ -1198,7 +1191,7 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name)
z = gpu_alloc_empty(x.dtype, context_name)(x.shape[0], y.shape[1])
z = gpu_alloc_empty(context_name, dtype=x.dtype)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)]
......@@ -1298,7 +1291,7 @@ def local_inplace_sparseblockouter(node):
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile', 'conv_dnn')
@register_opt('fast_compile')
@op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
......
......@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1):
alloc_op = gpu_alloc_empty(alloc.owner.op.dtype,
alloc.owner.op.context_name)
alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype)
inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs)
return opt
......
......@@ -24,7 +24,7 @@ from . import multinomial
import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua,
......@@ -1567,13 +1567,8 @@ def local_gpua_mrg1(op, context_name, inputs, outputs):
@local_optimizer([mrg_uniform])
def local_gpua_mrg(node):
# TODO : need description for function
if (type(node.op) == mrg_uniform and
isinstance(node.inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(node.inputs[0],
node.op.output_type.ndim,
node.op.output_type.dtype,
node.inputs[1])
return [outs[0], host_from_gpua(outs[1])]
context_name = infer_context_name(*node.inputs)
return local_gpua_mrg1(node.op, context_name, node.inputs, node.outputs)
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment