提交 bb53ed07，作者：Pascal Lamblin

Merge pull request #2664 from f0k/gpu-alloc-empty

GpuAllocEmpty for dnn_conv()
...@@ -3257,38 +3257,13 @@ class GpuSplit(tensor.Split, GpuOp): ...@@ -3257,38 +3257,13 @@ class GpuSplit(tensor.Split, GpuOp):
return Apply(self, [x] + node.inputs[1:], outs) return Apply(self, [x] + node.inputs[1:], outs)
class GpuAlloc(GpuOp): class GpuAllocEmpty(GpuOp):
"""Implement Alloc on the gpu. """Implement Alloc on the gpu, but without initializing memory."""
__props__ = ()
The memset_0 param is an optimization. When True, we call
cudaMalloc that is faster.
"""
def __init__(self, memset_0=False):
self.memset_0 = memset_0
def __eq__(self, other):
return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
return hash(type(self)) ^ hash(self.memset_0)
def __str__(self):
#Hide the memset parameter when not used to prevent confusion.
if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
else:
s = self.__class__.__name__
return s
def make_node(self, value, *shape): @staticmethod
#if their is unneeded transfert generated by the next line def validate_shape(shape):
#the optimizer will remove them.
v = as_cuda_ndarray_variable(value)
sh = [tensor.as_tensor_variable(s) for s in shape] sh = [tensor.as_tensor_variable(s) for s in shape]
if v.ndim != len(shape):
value = tensor.shape_padleft(value, len(shape) - v.ndim)
bcast = [] bcast = []
for s in sh: for s in sh:
if s.type.dtype[:3] not in ('int', 'uin'): if s.type.dtype[:3] not in ('int', 'uin'):
...@@ -3300,23 +3275,26 @@ class GpuAlloc(GpuOp): ...@@ -3300,23 +3275,26 @@ class GpuAlloc(GpuOp):
const_shp = None const_shp = None
bcast.append(numpy.all(1 == const_shp)) bcast.append(numpy.all(1 == const_shp))
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast) otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
return Apply(self, [v] + sh, [otype()]) output = otype()
return sh, output
def make_node(self, *shape):
    """Build an Apply node yielding an uninitialized GPU array of `shape`."""
    shape_vars, output = self.validate_shape(shape)
    # The buffer is left uninitialized (garbage), so approximate comparison
    # against it must always pass.
    output.values_eq_approx = tensor.type.values_eq_approx_always_true
    return Apply(self, shape_vars, [output])
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_):
out, = out_ out, = out_
v = inputs[0] sh = tuple([int(i) for i in inputs])
sh = tuple([int(i) for i in inputs[1:]])
if out[0] is None or out[0].shape != sh: if out[0] is None or out[0].shape != sh:
# XXX: We could implement and call CudaNdarray.empty(sh) instead.
out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh) out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
out[0][...] = v # broadcast v to fill us up
def c_code(self, node, name, inputs, out_, sub): def c_code(self, node, name, inputs, out_, sub):
out, = out_ out, = out_
fail = sub['fail'] fail = sub['fail']
value = inputs[0] shps = inputs
shps = inputs[1:]
nd = len(shps) nd = len(shps)
memset_0 = int(self.memset_0)
str = "int dims[%(nd)s];\n" % locals() str = "int dims[%(nd)s];\n" % locals()
for idx, sh in enumerate(shps): for idx, sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals() str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
...@@ -3340,6 +3318,65 @@ class GpuAlloc(GpuOp): ...@@ -3340,6 +3318,65 @@ class GpuAlloc(GpuOp):
%(fail)s; %(fail)s;
} }
} }
""" % locals()
return str
def infer_shape(self, node, input_shapes):
    # The op's inputs are the shape scalars themselves (see make_node), so
    # they directly describe the single output's shape.
    return [node.inputs]
def c_code_cache_version(self):
    """Version tag used to invalidate cached compiled C code."""
    return (1,)
def do_constant_folding(self, node):
    """Disable constant folding: the output is uninitialized memory, so
    folding it to a constant would bake in garbage values."""
    return False
# Shared, parameterless instance used wherever an uninitialized GPU
# allocation node is needed (e.g. by dnn_conv and the gradient ops below).
gpu_alloc_empty = GpuAllocEmpty()
class GpuAlloc(GpuAllocEmpty):
    """Implement Alloc on the gpu.

    Reuses GpuAllocEmpty's allocation machinery, then fills the buffer
    with a value. The memset_0 param is an optimization. When True, we
    call cudaMemset that is faster.
    """

    __props__ = ('memset_0',)
def __init__(self, memset_0=False):
    # memset_0=True lets the C code zero-fill the buffer via cudaMemset,
    # which is faster than broadcasting a zero value into it.
    self.memset_0 = memset_0
def __str__(self):
    """Render the op name; mention memset_0 only when it is enabled,
    to prevent confusion in printed graphs."""
    name = self.__class__.__name__
    if not self.memset_0:
        return name
    return "%s{memset_0=%s}" % (name, self.memset_0)
def make_node(self, value, *shape):
    """Build an Apply node that broadcasts `value` over the symbolic shape."""
    # Any superfluous host->GPU transfer introduced by this conversion is
    # removed later by the optimizer.
    gpu_value = as_cuda_ndarray_variable(value)
    shape_vars, output = self.validate_shape(shape)
    return Apply(self, [gpu_value] + shape_vars, [output])
def perform(self, node, inputs, out_):
    """Allocate the output buffer (delegated to GpuAllocEmpty) and fill it."""
    fill_value = inputs[0]
    shape_inputs = inputs[1:]
    # The parent class handles (re)allocation of the output storage.
    super(GpuAlloc, self).perform(node, shape_inputs, out_)
    storage, = out_
    # Broadcast the fill value across the whole buffer.
    storage[0][...] = fill_value
def c_code(self, node, name, inputs, out_, sub):
    # the super class (GpuAllocEmpty) allocates memory, we fill it
    value = inputs[0]
    shps = inputs[1:]
    # NOTE(review): super(GpuAllocEmpty, self) starts the MRO search *after*
    # GpuAllocEmpty, so this dispatches past GpuAllocEmpty.c_code — the very
    # method that emits the allocation code. It likely should be
    # super(GpuAlloc, self), matching perform() above — confirm.
    str = super(GpuAllocEmpty, self).c_code(node, name, shps, out_, sub)
    out, = out_
    fail = sub['fail']
    # Baked into the generated C as a compile-time flag for the memset path.
    memset_0 = int(self.memset_0)
str += """
if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s)) if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
{ {
cudaError_t err = cudaMemset(%(out)s->devdata, 0, cudaError_t err = cudaMemset(%(out)s->devdata, 0,
...@@ -3369,10 +3406,6 @@ class GpuAlloc(GpuOp): ...@@ -3369,10 +3406,6 @@ class GpuAlloc(GpuOp):
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [node.inputs[1:]] return [node.inputs[1:]]
def grad(self, inputs, grads):
gout, = grads
return [None for i in inputs]
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) return (9,)
......
...@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp ...@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, host_from_gpu,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
gpu_alloc) gpu_alloc_empty)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax from theano.sandbox.cuda.nnet import GpuSoftmax
...@@ -443,8 +443,8 @@ class GpuDnnConv(DnnBase, COp): ...@@ -443,8 +443,8 @@ class GpuDnnConv(DnnBase, COp):
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc) d_img = GpuDnnConvGradI()(kerns, top, gpu_alloc_empty(*img.shape), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc) d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -519,8 +519,8 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -519,8 +519,8 @@ class GpuDnnConvGradW(DnnBase, COp):
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc) d_img = GpuDnnConvGradI()(kerns, top, gpu_alloc_empty(*img.shape), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc) d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -586,8 +586,8 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -586,8 +586,8 @@ class GpuDnnConvGradI(DnnBase, COp):
img = gpu_contiguous(img) img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc) d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc) d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -675,7 +675,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -675,7 +675,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc(_zero.clone(), shape_i(kerns, 1, fgraph), out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape) conv_mode='cross')(img.shape, out.shape)
...@@ -692,7 +692,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -692,7 +692,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc(_zero.clone(), shape_i(img, 0, fgraph), out = gpu_alloc_empty(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape2, shape3) shape_i(kerns, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape) conv_mode=conv_mode)(out.shape, kerns.shape)
...@@ -709,9 +709,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -709,9 +709,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out = gpu_alloc(_zero.clone(), out = gpu_alloc_empty(*out_shp)
out_shp[0], out_shp[1],
out_shp[2], out_shp[3])
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc) return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
......
...@@ -630,6 +630,10 @@ def values_eq_approx_remove_inf_nan(a, b): ...@@ -630,6 +630,10 @@ def values_eq_approx_remove_inf_nan(a, b):
return TensorType.values_eq_approx(a, b, True, True) return TensorType.values_eq_approx(a, b, True, True)
def values_eq_approx_always_true(a, b):
    """Approximate-equality predicate that accepts any pair of values.

    Used for outputs whose contents are deliberately undefined (e.g.
    uninitialized allocations), where no comparison should ever fail.
    """
    return True
# Register TensorType C code for ViewOp. # Register TensorType C code for ViewOp.
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
TensorType, TensorType,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论