提交 c7ca08bf authored 作者: carriepl's avatar carriepl

Merge pull request #2753 from mducoffe/ccw_2692

AllocEmpty
......@@ -26,7 +26,7 @@ from theano.sandbox.cuda.basic_ops import (
GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten,
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
......@@ -570,6 +570,8 @@ def local_gpu_dot22(node):
@local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
def local_gpu_dot22scalar(node):
"""
Deprecated: _dot22scalar has been replaced by gemm.
See Dot22scalar for more details.
gpu_from_host(dot22scalar) -> gpudot(gpu_from_host)
dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
......@@ -2290,6 +2292,15 @@ def gpuScanOptimization(node):
return False
@register_opt()
@local_optimizer([tensor.AllocEmpty, gpu_from_host])
def local_gpu_allocempty(node):
    """Move a float32 ``AllocEmpty`` to the GPU.

    AllocEmpty(float32) -> host_from_gpu(GpuAllocEmpty(...))

    Only float32 allocations are transferred; any other dtype (or any
    other op) is left untouched.
    """
    op = node.op
    if not isinstance(op, tensor.AllocEmpty):
        return False
    if op.dtype != "float32":
        return False
    gpu_out = GpuAllocEmpty()(*node.inputs)
    return [host_from_gpu(gpu_out)]
optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
gpu_flag=True),
......
......@@ -382,7 +382,7 @@ def test_alloc_empty():
assert out.shape == (2, 3)
assert out.dtype == 'float32'
# Test that we do not merge them.
# Test that we merge them.
f = theano.function([], [cuda.basic_ops.gpu_alloc_empty(2, 3),
cuda.basic_ops.gpu_alloc_empty(2, 3)])
out = f()
......
......@@ -154,6 +154,18 @@ def test_gpualloc():
assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
def test_gpuallocempty():
    """AllocEmpty(float32) must be lifted to the GPU; int32 must not."""
    # float32 graph compiled in GPU mode: expect a GpuAllocEmpty node.
    f_gpu = theano.function([], tensor.AllocEmpty('float32')(2, 3),
                            mode=mode_with_gpu)
    gpu_topo = f_gpu.maker.fgraph.toposort()
    assert numpy.any([isinstance(apply.op, basic_ops.GpuAllocEmpty)
                      for apply in gpu_topo])

    # int32 graph compiled on the CPU: the GPU op must not appear.
    f_cpu = theano.function([], tensor.AllocEmpty('int32')(2, 3))
    cpu_topo = f_cpu.maker.fgraph.toposort()
    assert not numpy.any([isinstance(apply.op, basic_ops.GpuAllocEmpty)
                          for apply in cpu_topo])
class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
    # Re-run the CPU elemwise/alloc optimization tests with float32,
    # the dtype used on the GPU.
    # NOTE(review): assumes the base class reads ``dtype`` when building
    # its test graphs -- confirm against test_opt.
    dtype = 'float32'
......
......@@ -17,7 +17,7 @@ from theano.tensor import elemwise
from theano.tensor.var import (AsTensorError, TensorVariable,
TensorConstant,
_tensor_py_operators)
from theano.tensor.type import TensorType
from theano.tensor.type import TensorType, values_eq_approx_always_true
from theano.tensor.type_other import NoneConst
from theano import scalar as scal
from theano.compat import partial
......@@ -592,7 +592,8 @@ def get_scalar_constant_value(orig_v, elemwise=True,
continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant):
return numpy.asarray(v.owner.inputs[0].data.shape[v.owner.op.i])
return numpy.asarray(
v.owner.inputs[0].data.shape[v.owner.op.i])
# Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow.
......@@ -5467,3 +5468,84 @@ class Choose(Op):
choice = inputs[1]
# TODO reuse out?
z[0] = numpy.choose(a, choice, mode=self.mode)
class AllocEmpty(gof.Op):
    """Implement Alloc on the cpu, but without initializing memory.

    The output buffer contains arbitrary (uninitialized) values; callers
    must fully overwrite it before reading.  The op is parameterized only
    by the output ``dtype``; the shape comes from the inputs passed to
    ``make_node``.
    """

    # dtype is the only property distinguishing two instances.
    __props__ = ("dtype",)

    def __init__(self, dtype):
        # specify the type of the data
        assert isinstance(dtype, str)
        self.dtype = dtype.lower()

    def validate_shape(self, shape):
        """Check the shape arguments and build the output variable.

        :param shape: iterable of integer scalar variables/values.
        :returns: ``(shape_vars, output_var)``.
        :raises TypeError: if a shape argument is not an integer.
        """
        sh = [as_tensor_variable(s) for s in shape]
        bcast = []
        for s in sh:
            if s.type.dtype[:3] not in ('int', 'uin'):
                raise TypeError('Shape arguments must be integers', s)
            # if s is constant 1, then we're broadcastable in that dim
            try:
                const_shp = get_scalar_constant_value(s)
            except NotScalarConstantError:
                const_shp = None
            bcast.append(numpy.all(1 == const_shp))
        otype = TensorType(dtype=self.dtype, broadcastable=bcast)
        output = otype()
        return sh, output

    def make_node(self, *shape):
        shape, output = self.validate_shape(shape)
        # The content is uninitialized, so any two outputs must compare
        # as "equal" for the debugging/comparison machinery.
        output.tag.values_eq_approx = values_eq_approx_always_true
        return Apply(self, shape, [output])

    def perform(self, node, inputs, out_):
        out, = out_
        sh = tuple(int(i) for i in inputs)
        # Reuse the previous output buffer when the shape is unchanged.
        if out[0] is None or out[0].shape != sh:
            out[0] = numpy.empty(sh, dtype=self.dtype)

    def c_code(self, node, name, inputs, out_, sub):
        dtype = "NPY_" + self.dtype.upper()
        out, = out_
        fail = sub['fail']
        shps = inputs
        nd = len(shps)
        # Accumulate the generated C source.  (Renamed from ``str``,
        # which shadowed the builtin.)
        code = "npy_intp dims[%(nd)s];\n" % locals()
        for idx, sh in enumerate(shps):
            code += "dims[%(idx)s] =" \
                    "((npy_intp)((dtype_%(sh)s*)" \
                    " PyArray_DATA(%(sh)s))[0]);\n" % locals()
        # Validate that the output storage exists
        code += "if(%(out)s==NULL\n" % locals()
        for idx, sh in enumerate(shps):
            code += "||PyArray_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]" % locals()
        code += """){
            /* Reference received to invalid output variable.
               Decrease received reference's ref count and allocate new
               output variable */
            Py_XDECREF(%(out)s);
            %(out)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s,
                                                    dims,
                                                    %(dtype)s,
                                                    0);
            if (!%(out)s)
            {
                PyErr_SetString(PyExc_MemoryError, "alloc failed");
                %(fail)s;
            }
        }
        """ % locals()
        return code

    def infer_shape(self, node, input_shapes):
        # The inputs ARE the output shape.
        return [node.inputs]

    def c_code_cache_version(self):
        return (3,)

    def do_constant_folding(self, node):
        # Folding would allocate (uninitialized) storage at compile time
        # for no benefit.
        return False
......@@ -1824,7 +1824,6 @@ def local_dot22_to_ger_or_gemv(node):
# x and y are both vectors so this might qualifies for a GER
xv = x.dimshuffle(0)
yv = y.dimshuffle(1)
zeros = T.zeros([x.shape[0], y.shape[1]], dtype=x.dtype)
rval = ger(zeros, one, xv, yv)
return [rval]
......@@ -1832,19 +1831,19 @@ def local_dot22_to_ger_or_gemv(node):
# x and y are both vectors so this qualifies for a sdot / ddot
# TODO: Theano doesn't have a sdot, but gemv is better than _dot22
xv = x.dimshuffle(1)
zeros = T.zeros([1], x.dtype)
zeros = T.AllocEmpty(x.dtype)(1)
rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
return [rval.dimshuffle('x', 0)]
if xb[0] and not yb[0] and not yb[1]:
# x is vector, y is matrix so try gemv
xv = x.dimshuffle(1)
zeros = T.zeros([y.shape[1]], x.dtype)
zeros = T.AllocEmpty(x.dtype)(y.shape[1])
rval = gemv_no_inplace(zeros, one, y.T, xv, zero)
return [rval.dimshuffle('x', 0)]
if not xb[0] and not xb[1] and yb[1]:
# x is matrix, y is vector, try gemv
yv = y.dimshuffle(0)
zeros = T.zeros([x.shape[0]], dtype=x.dtype)
zeros = T.AllocEmpty(x.dtype)(x.shape[0])
rval = gemv_no_inplace(zeros, one, x, yv, zero)
return [rval.dimshuffle(0, 'x')]
......@@ -2043,8 +2042,12 @@ def local_dot22_to_dot22scalar(node):
a = T.cast(_as_scalar(m.owner.inputs[scalar_idx],
dtype=d.dtype), d.type.dtype)
assert not a.type.ndim
dot = _dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a)
z = T.AllocEmpty(d.owner.inputs[0].dtype)(d.owner.inputs[0].shape[0],
d.owner.inputs[1].shape[1])
zero = T.as_tensor_variable(numpy.asarray(0, dtype=a.dtype))
dot = gemm(z, a, d.owner.inputs[0], d.owner.inputs[1], zero)
# The other inputs to the original node that were
# neither part of the dot22 or this mul should be
# factors in the returned "mul" node.
......@@ -2079,11 +2082,16 @@ def local_dot22_to_dot22scalar(node):
a = T.cast(i_scalar[scalar_idx], d.type.dtype)
assert not a.type.ndim
if len(o) == 0:
return [_dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a)]
z = T.AllocEmpty(d.owner.inputs[0].dtype)(d.owner.inputs[0].shape[0],
d.owner.inputs[1].shape[1])
zero = T.as_tensor_variable(numpy.asarray(0, dtype=a.dtype))
return [gemm(z, a, d.owner.inputs[0], d.owner.inputs[1], zero)]
else:
return [T.mul(_dot22scalar(d.owner.inputs[0],
d.owner.inputs[1], a), *o)]
z = T.AllocEmpty(d.owner.inputs[0].dtype)(d.owner.inputs[0].shape[0],
d.owner.inputs[1].shape[1])
zero = T.as_tensor_variable(numpy.asarray(0, dtype=a.dtype))
return [T.mul(gemm(z, a, d.owner.inputs[0], d.owner.inputs[1],
zero), *o)]
# Must happen after gemm, as the gemm optimizer doesn't understand
# dot22scalar, and gemm gives more speed-up than dot22scalar.
blas_optdb.register('local_dot22_to_dot22scalar',
......
......@@ -47,7 +47,7 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
itensor3, Tile, switch, Diagonal, Diag,
nonzero, flatnonzero, nonzero_values,
stacklists, DimShuffle, hessian, ptp, power,
swapaxes, choose, Choose, NoneConst,
swapaxes, choose, Choose, NoneConst, AllocEmpty
)
from theano.tests import unittest_tools as utt
......@@ -7558,6 +7558,15 @@ class T_Choose(utt.InferShapeTester):
# Op that should be removed from the graph.
self.op_class)
def test_allocempty():
    """AllocEmpty compiles to a single node and yields the right array."""
    fn = theano.function([], AllocEmpty("float32")(2, 3))
    # The whole graph should be exactly one apply node.
    assert len(fn.maker.fgraph.apply_nodes) == 1
    result = fn()
    assert result.shape == (2, 3)
    assert result.dtype == 'float32'
"""
if __name__ == '__main__':
......
......@@ -875,28 +875,32 @@ def test_dot22scalar():
cst = theano.tensor.basic.constant(.2, dtype=dtype4)
cst2 = theano.tensor.basic.constant(.1, dtype=dtype4)
def check_dot22scalar(func, len_topo_scalar=-1):
def check_dot22scalar_gemm(func, len_topo_scalar=-1):
topo = func.maker.fgraph.toposort()
ops = [x.op for x in topo]
classes = [type(x.op) for x in topo]
dtype4_upcast = theano.scalar.upcast(dtype4, dtype1,
dtype2)
if dtype1 == dtype2 == dtype3 == dtype4_upcast:
if len_topo_scalar > 0:
assert len(topo) == len_topo_scalar
assert _dot22scalar in ops, (dtype1, dtype2,
assert gemm_inplace in ops, (dtype1, dtype2,
dtype3, dtype4)
elif dtype1 == dtype2 == dtype4_upcast:
if not (len_topo_scalar > 0):
assert len(topo) == len_topo_scalar
assert _dot22scalar in ops, (dtype1, dtype2,
assert gemm_inplace in ops, (dtype1, dtype2,
dtype3, dtype4)
assert not T.Elemwise in classes, (
dtype1, dtype2, dtype3, dtype4)
else:
# Currently there is a problem of
# optimization order The constant get
# upcasted to float64 before we try to
# merge it with the dot22 of
# float32. So this prevent the merge.
assert _dot22scalar in ops or _dot22 in ops, (
assert gemm_inplace in ops or _dot22 in ops, (
dtype1, dtype2, dtype3, dtype4)
elif dtype1 == dtype2:
......@@ -916,7 +920,7 @@ def test_dot22scalar():
f = theano.function([a, b], cst * T.dot(a, b),
mode=mode_blas_opt)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 1)
check_dot22scalar_gemm(f, 1)
f(av, bv)
......@@ -925,7 +929,8 @@ def test_dot22scalar():
cst * c * T.dot(a, b),
mode=mode_blas_opt)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
#print (av.dtype, bv.dtype, cv.dtype)
f(av, bv, cv)
......@@ -933,7 +938,7 @@ def test_dot22scalar():
c * cst * T.dot(a, b),
mode=mode_blas_opt)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
f(av, bv, cv)
# Here, canonicalize also seems needed
......@@ -943,7 +948,7 @@ def test_dot22scalar():
cst2 * c * cst * T.dot(a, b),
mode=m2)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
f(av, bv, cv)
if dtype1 == dtype2 == dtype3:
......@@ -951,7 +956,7 @@ def test_dot22scalar():
c * cst * a * T.dot(a, b),
mode=m2)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
f(sv, sv, sv)
f = theano.function([a, b, c],
......@@ -974,7 +979,7 @@ def test_dot22scalar():
c * a * cst * T.dot(a, b),
mode=m2)
topo = f.maker.fgraph.toposort()
check_dot22scalar(f, 2)
check_dot22scalar_gemm(f, 5)
f(sv, sv, sv)
cmp((3, 4), (4, 5), (3, 5))
......@@ -994,7 +999,7 @@ def test_dot22scalar_cast():
for scalar_int_type in T.int_dtypes:
y = T.scalar(dtype=scalar_int_type)
f = theano.function([A, y], T.dot(A, A) * y, mode=mode_blas_opt)
assert _dot22scalar in [x.op for x in f.maker.fgraph.toposort()]
assert gemm_inplace in [x.op for x in f.maker.fgraph.toposort()]
A = T.fmatrix()
for scalar_int_type in T.int_dtypes:
y = T.scalar(dtype=scalar_int_type)
......@@ -1002,7 +1007,7 @@ def test_dot22scalar_cast():
if scalar_int_type in ['int32', 'int64']:
assert _dot22 in [x.op for x in f.maker.fgraph.toposort()]
else:
assert _dot22scalar in [x.op for x in f.maker.fgraph.toposort()]
assert gemm_inplace in [x.op for x in f.maker.fgraph.toposort()]
def test_local_dot22_to_dot22scalar():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论