提交 3c51a2f1 authored 作者: Frederic Bastien's avatar Frederic Bastien

Added GpuAlloc, an optimization to insert it into the graph, and a test.

上级 a163846d
......@@ -105,7 +105,7 @@ if cuda_available:
import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape,
GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape,
GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuAlloc,
GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4
, scalar, vector, matrix, row, col, tensor3, tensor4)
import opt
......
......@@ -1817,6 +1817,76 @@ class GpuJoin(tensor.Join):
gpu_join = GpuJoin()
class GpuAlloc(Op):
    """GPU counterpart of `tensor.Alloc`: broadcast a fully-broadcastable
    value into a freshly allocated float32 CudaNdarray of the requested
    shape.

    The op is stateless, so all instances compare (and hash) equal.
    """

    def __init__(self):
        pass

    def __eq__(self, other):
        # Stateless op: any two instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, value, *shape):
        """Build the Apply node.

        :param value: value to fill the output with; must be broadcastable
            in every dimension (enforced by the assert below).
        :param shape: integer scalar variables giving the output shape.
        :raises TypeError: if a shape argument is not of integer dtype.
        """
        # If there are unneeded transfers generated by the next line,
        # the optimizer will remove them.
        v = gpu_from_host(tensor.as_tensor_variable(value))
        sh = [tensor.as_tensor_variable(s) for s in shape]
        assert all(v.broadcastable)
        bcast = []
        for s in sh:
            if s.type.dtype[:3] not in ('int', 'uin'):
                raise TypeError('Shape arguments must be integers', s)
            # If s is the constant 1, then we're broadcastable in that dim.
            try:
                const_shp = tensor.get_constant_value(s)
            except TypeError:
                const_shp = None
            bcast.append(numpy.all(1 == const_shp))
        otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
        return Apply(self, [v] + sh, [otype()])

    def perform(self, node, inputs, out_):
        # out_ is the output-storage tuple; unpack its single cell.
        # (Explicit unpacking replaces the Python-2-only tuple parameter.)
        out, = out_
        v = inputs[0]
        sh = tuple([int(i) for i in inputs[1:]])
        # Reuse the previously allocated buffer when its shape still matches.
        if out[0] is None or out[0].shape != sh:
            out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
        out[0][...] = v  # broadcast v to fill us up

    def c_code(self, node, name, inputs, out_, sub):
        out, = out_
        value = inputs[0]
        shps = inputs[1:]
        nd = len(shps)
        # `code` was previously named `str`, shadowing the builtin.
        code = "int dims[%(nd)s];\n" % locals()
        for idx, sh in enumerate(shps):
            code += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
        # Reallocate the output only when it is missing or has a stale shape.
        code += "if(%(out)s==NULL\n" % locals()
        for idx, sh in enumerate(shps):
            code += "||CudaNdarray_HOST_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]" % locals()
        # NOTE(review): the return values of CudaNdarray_alloc_contiguous and
        # CudaNdarray_CopyFromCudaNdarray are not checked here — TODO confirm
        # whether a %(fail)s path is needed.
        code += """){
            Py_XDECREF(%(out)s);
            %(out)s = (CudaNdarray*)CudaNdarray_new_null();
            CudaNdarray_alloc_contiguous(%(out)s, %(nd)s, dims);
        }
        CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true);
        """ % locals()
        return code

    def infer_shape(self, node, input_shapes):
        # The output shape is exactly the shape inputs (all but the value).
        return [node.inputs[1:]]

    def grad(self, inputs, grads):
        # No gradient is propagated through the value or the shapes.
        return [None for i in inputs]

    def c_code_cache_version(self):
        return (1,)

gpu_alloc = GpuAlloc()
# These are predefined CudaNdarrayTypes, as done in tensor.basic.
# Useful mostly for tests, as the gpu ops are inserted automatically...
......
......@@ -594,3 +594,19 @@ else:
compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc(node):
    """Move a host `tensor.alloc` to the GPU as `gpu_alloc`.

    Applies only when every client of the alloc's output is a `join`
    whose non-axis inputs all either come from the GPU (`host_from_gpu`)
    or are themselves host allocs.  Returns the replacement list, or
    None (implicitly) when the pattern does not apply.
    """
    if node.op == tensor.alloc:
        # Two further cases (input transferred from the GPU; all clients
        # transferring to the GPU) were considered but are not known to
        # occur in practice.  If you see them happening, add a test for
        # them or contact the mailing list.
        #
        # NOTE(review): if a client were the special 'output' pseudo-node,
        # `c.op` below would raise — TODO confirm clients here are always
        # real Apply nodes.
        all_join_clients = all(
            [c.op == tensor.join and
             all([i.owner and
                  i.owner.op in [host_from_gpu, tensor.alloc]
                  for i in c.inputs[1:]])
             for c, idx in node.outputs[0].clients])
        # If every client is a join with inputs on the gpu or alloc.
        if all_join_clients:
            new_node = host_from_gpu(gpu_alloc(*node.inputs))
            return [new_node]
......@@ -16,6 +16,7 @@ if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as tcn
import theano.sandbox.cuda as cuda
import theano.sandbox.cuda.basic_ops as B
import theano.compile.mode
from theano.tests import unittest_tools as utt
......@@ -626,6 +627,32 @@ def test_gpujoin_no_rebroadcast():
l = f.maker.env.toposort()
assert not any([isinstance(x.op,T.Rebroadcast) for x in l])
def test_gpujoin_gpualloc():
    """Check that alloc+join graphs are moved to the GPU (gpu_alloc and
    gpu_join appear in the compiled graph) and that GPU results match
    the CPU reference numerically."""
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    f = theano.function([a, b],
                        T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                        mode=mode_without_gpu)
    f_gpu = theano.function([a, b],
                            T.join(0, T.zeros_like(a), T.ones_like(b)),
                            mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b],
                             T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                             mode=mode_with_gpu)

    # The CPU graph keeps the host ops...
    assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 2
    assert sum([node.op == T.join for node in f.maker.env.toposort()]) == 1
    # ...while both GPU graphs use the GPU versions.
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu2.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu2.maker.env.toposort()]) == 1

    # Numerical equivalence.  f computes join(...) + 4 while f_gpu computes
    # the bare join, so subtract the constant before comparing; previously
    # f_gpu's output was never validated at all.
    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
    assert numpy.allclose(f(a_val, b_val) - 4, f_gpu(a_val, b_val))
if __name__ == '__main__':
test_gpujoin_twomatrices_joincolumns()
test_gpujoin_assert_cndas()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论