提交 ad8571e0，作者：f0k

Add GpuAllocEmpty and use it in dnn_conv()

上级 0838ae4d
......@@ -3257,37 +3257,19 @@ class GpuSplit(tensor.Split, GpuOp):
return Apply(self, [x] + node.inputs[1:], outs)
class GpuAlloc(GpuOp):
"""Implement Alloc on the gpu.
The memset_0 param is an optimization. When True, we call
cudaMalloc that is faster.
"""
def __init__(self, memset_0=False):
self.memset_0 = memset_0
class GpuAllocEmpty(GpuOp):
    """Implement Alloc on the gpu, but without initializing memory.

    The op takes only shape inputs (one integer scalar per dimension)
    and returns a CudaNdarray of that shape whose contents are not
    promised to be initialized.
    """

    def __eq__(self, other):
        # The op carries no parameters, so all instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        # Must stay consistent with __eq__: hash on the type alone.
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__
def make_node(self, value, *shape):
#if their is unneeded transfert generated by the next line
#the optimizer will remove them.
v = as_cuda_ndarray_variable(value)
def make_node(self, *shape):
sh = [tensor.as_tensor_variable(s) for s in shape]
if v.ndim != len(shape):
value = tensor.shape_padleft(value, len(shape) - v.ndim)
bcast = []
for s in sh:
......@@ -3300,23 +3282,20 @@ class GpuAlloc(GpuOp):
const_shp = None
bcast.append(numpy.all(1 == const_shp))
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
return Apply(self, [v] + sh, [otype()])
return Apply(self, sh, [otype()])
def perform(self, node, inputs, out_):
    """Python fallback: allocate a CudaNdarray of the requested shape.

    `inputs` holds only the shape scalars -- this op takes no fill value.
    """
    out, = out_
    sh = tuple([int(i) for i in inputs])
    # Reuse the previously allocated output when the shape is unchanged.
    if out[0] is None or out[0].shape != sh:
        # XXX: We could implement and call CudaNdarray.empty(sh) instead;
        # zeros() initializes the memory, which this op does not require.
        out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
def c_code(self, node, name, inputs, out_, sub):
out, = out_
fail = sub['fail']
value = inputs[0]
shps = inputs[1:]
shps = inputs
nd = len(shps)
memset_0 = int(self.memset_0)
str = "int dims[%(nd)s];\n" % locals()
for idx, sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
......@@ -3340,6 +3319,75 @@ class GpuAlloc(GpuOp):
%(fail)s;
}
}
""" % locals()
return str
def infer_shape(self, node, input_shapes):
    # The op's inputs *are* the output shape (one scalar per dimension).
    return [node.inputs]

def grad(self, inputs, grads):
    # The unpacking asserts there is exactly one output gradient.  The op
    # is not differentiable w.r.t. its (integer) shape inputs.
    gout, = grads
    return [None for i in inputs]

def c_code_cache_version(self):
    return (1,)

def do_constant_folding(self, node):
    # XXX: anything needed here?
    return False
# Module-level singleton: GpuAllocEmpty has no parameters, so one shared
# instance suffices for all uses.
gpu_alloc_empty = GpuAllocEmpty()
class GpuAlloc(GpuAllocEmpty):
    """Implement Alloc on the gpu.

    The memset_0 param is an optimization. When True, we call
    cudaMemset that is faster.
    """

    def __init__(self, memset_0=False):
        # When True, zero-filling is done with cudaMemset in the C code.
        self.memset_0 = memset_0

    def __eq__(self, other):
        # Unlike the parent, instances differ by their fill strategy.
        return type(self) == type(other) and self.memset_0 == other.memset_0

    def __hash__(self):
        return hash(type(self)) ^ hash(self.memset_0)

    def __str__(self):
        # Hide the memset parameter when not used to prevent confusion.
        if self.memset_0:
            s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
        else:
            s = self.__class__.__name__
        return s
def make_node(self, value, *shape):
    """Build the Apply node: the fill value first, then the shape scalars."""
    # Let GpuAllocEmpty compute the broadcast pattern / output type from
    # the shape inputs, then rebuild the Apply with the value prepended.
    node = super(GpuAlloc, self).make_node(*shape)
    # Detach the output from the parent's Apply so it can be attached to
    # the new Apply built below.
    node.outputs[0].owner = None
    # If there are unneeded transfers generated by the next line,
    # the optimizer will remove them.
    v = as_cuda_ndarray_variable(value)
    if v.ndim != len(shape):
        # Bug fix: rebind `v`.  The original assigned the padded result to
        # `value` and then discarded it, so the un-padded `v` was used below.
        v = tensor.shape_padleft(v, len(shape) - v.ndim)
    return Apply(self, [v] + node.inputs, node.outputs)
def perform(self, node, inputs, out_):
    """Allocate via the parent, then fill with the value.

    inputs[0] is the fill value; inputs[1:] are the shape scalars.
    """
    # Bug fix: `v` was previously undefined (NameError), and the parent's
    # perform received the full `inputs` -- it applies int() to every
    # element, which would fail on the fill value.  Pass the shapes only.
    v = inputs[0]
    # the super class (GpuAllocEmpty) allocates memory, we fill it
    super(GpuAlloc, self).perform(node, inputs[1:], out_)
    out, = out_
    out[0][...] = v  # broadcast v to fill us up
def c_code(self, node, name, inputs, out_, sub):
# the super class (GpuAllocEmpty) allocates memory, we fill it
value = inputs[0]
shps = inputs[1:]
str = super(GpuAllocEmpty, self).c_code(node, name, shps, out_, sub)
out, = out_
fail = sub['fail']
memset_0 = int(self.memset_0)
str += """
if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
......@@ -3367,10 +3415,6 @@ class GpuAlloc(GpuOp):
def infer_shape(self, node, input_shapes):
    # The output shape is given by the shape inputs; input 0 is the
    # fill value and contributes nothing to the shape.
    return [node.inputs[1:]]

def grad(self, inputs, grads):
    # The unpacking asserts exactly one output gradient; the gradient is
    # not implemented for this op (None for every input).
    gout, = grads
    return [None for i in inputs]

def c_code_cache_version(self):
    return (7,)
......
......@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu,
gpu_contiguous, HostFromGpu,
gpu_alloc)
gpu_alloc_empty)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
......@@ -443,8 +443,8 @@ class GpuDnnConv(DnnBase, COp):
top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_img = GpuDnnConvGradI()(kerns, top, img, desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns, desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -519,8 +519,8 @@ class GpuDnnConvGradW(DnnBase, COp):
kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_img = GpuDnnConvGradI()(kerns, top, img, desc)
d_top = GpuDnnConv()(img, kerns, top, desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -586,8 +586,8 @@ class GpuDnnConvGradI(DnnBase, COp):
img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns, desc)
d_top = GpuDnnConv()(img, kerns, top, desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -675,7 +675,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc(_zero.clone(), shape_i(kerns, 1, fgraph),
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape)
......@@ -692,7 +692,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc(_zero.clone(), shape_i(img, 0, fgraph),
out = gpu_alloc_empty(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape)
......@@ -709,9 +709,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc(_zero.clone(),
out_shp[0], out_shp[1],
out_shp[2], out_shp[3])
out = gpu_alloc_empty(*out_shp)
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论