Commit 3c51a2f1 authored by Frederic Bastien

added GpuAlloc and opt to put them into the graph and test.

Parent a163846d
...@@ -105,7 +105,7 @@ if cuda_available: ...@@ -105,7 +105,7 @@ if cuda_available:
import basic_ops import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise, from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuDimShuffle, GpuSum, GpuReshape,
GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuAlloc,
GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4 GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4
, scalar, vector, matrix, row, col, tensor3, tensor4) , scalar, vector, matrix, row, col, tensor3, tensor4)
import opt import opt
......
...@@ -1817,6 +1817,76 @@ class GpuJoin(tensor.Join): ...@@ -1817,6 +1817,76 @@ class GpuJoin(tensor.Join):
gpu_join = GpuJoin() gpu_join = GpuJoin()
class GpuAlloc(Op):
    """GPU version of tensor.alloc: broadcast a scalar-like value to
    fill a CudaNdarray of the requested shape.

    The value is moved to the GPU in make_node; if that transfer turns
    out to be unneeded, the optimizer will remove it.
    """
    def __init__(self):
        pass

    def __eq__(self, other):
        # All GpuAlloc instances are interchangeable: compare by type only.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, value, *shape):
        """Build the Apply node.

        :param value: fully-broadcastable (scalar-like) value to replicate.
        :param shape: integer scalar variables giving the output dimensions.
        :raises TypeError: if any shape argument is not of integer dtype.
        """
        # If there are unneeded transfers generated by the next line,
        # the optimizer will remove them.
        v = gpu_from_host(tensor.as_tensor_variable(value))
        sh = [tensor.as_tensor_variable(s) for s in shape]
        # The value must be broadcastable in every dimension so that it
        # can fill an output of arbitrary shape.
        assert all(v.broadcastable)
        bcast = []
        for s in sh:
            if s.type.dtype[:3] not in ('int', 'uin'):
                raise TypeError('Shape arguments must be integers', s)
            # If s is constant 1, then we're broadcastable in that dim.
            try:
                const_shp = tensor.get_constant_value(s)
            except TypeError:
                const_shp = None
            bcast.append(numpy.all(1 == const_shp))
        otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
        return Apply(self, [v] + sh, [otype()])

    def perform(self, node, inputs, out_storage):
        out, = out_storage
        v = inputs[0]
        sh = tuple([int(i) for i in inputs[1:]])
        # Reuse the previous output buffer when its shape still matches;
        # otherwise allocate a fresh one.
        if out[0] is None or out[0].shape != sh:
            out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
        out[0][...] = v  # broadcast v to fill us up

    def c_code(self, node, name, inputs, out_storage, sub):
        out, = out_storage
        value = inputs[0]
        shps = inputs[1:]
        nd = len(shps)
        # Renamed from `str` so the builtin is not shadowed.
        code = "int dims[%(nd)s];\n" % locals()
        for idx, sh in enumerate(shps):
            code += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
        # Reallocate the output only when it is missing or mis-shaped.
        code += "if(%(out)s==NULL\n" % locals()
        for idx, sh in enumerate(shps):
            code += "||CudaNdarray_HOST_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]" % locals()
        code += """){
            Py_XDECREF(%(out)s);
            %(out)s= (CudaNdarray*)CudaNdarray_new_null();
            CudaNdarray_alloc_contiguous(%(out)s, %(nd)s, dims);
        }
        CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true);
        """ % locals()
        return code

    def infer_shape(self, node, input_shapes):
        # The output shape is exactly the shape arguments (inputs[1:]).
        return [node.inputs[1:]]

    def grad(self, inputs, grads):
        # Not differentiable with respect to any input.
        return [None for i in inputs]

    def c_code_cache_version(self):
        return (1,)

gpu_alloc = GpuAlloc()
# Those are predifined CudaNdarrayType as done in tensor.basic # Those are predifined CudaNdarrayType as done in tensor.basic
# Usefull mostly for test as the gpu op are inserted automatically... # Usefull mostly for test as the gpu op are inserted automatically...
......
...@@ -594,3 +594,19 @@ else: ...@@ -594,3 +594,19 @@ else:
compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion') compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc(node):
    """Replace a host-side alloc by gpu_alloc (wrapped in host_from_gpu)
    when every client of the alloc output is a join whose other inputs
    already come from the GPU or from another alloc.

    Two other candidate cases (the alloc's input coming from the GPU,
    and all clients transferring the result to the GPU) were considered
    but left out because we don't see how they can happen.  If you see
    one happening, add a test for it or contact the mailing list.
    """
    if node.op != tensor.alloc:
        return

    # Evaluate the condition for every client (eagerly, like the
    # original all([...]) over a list comprehension).
    client_ok = []
    for client, _idx in node.outputs[0].clients:
        ok = client.op == tensor.join and all(
            inp.owner and inp.owner.op in [host_from_gpu, tensor.alloc]
            for inp in client.inputs[1:])
        client_ok.append(ok)

    if all(client_ok):
        return [host_from_gpu(gpu_alloc(*node.inputs))]
...@@ -16,6 +16,7 @@ if cuda_ndarray.cuda_available == False: ...@@ -16,6 +16,7 @@ if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
import theano.sandbox.cuda.basic_ops as B
import theano.compile.mode import theano.compile.mode
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -626,6 +627,32 @@ def test_gpujoin_no_rebroadcast(): ...@@ -626,6 +627,32 @@ def test_gpujoin_no_rebroadcast():
l = f.maker.env.toposort() l = f.maker.env.toposort()
assert not any([isinstance(x.op,T.Rebroadcast) for x in l]) assert not any([isinstance(x.op,T.Rebroadcast) for x in l])
def test_gpujoin_gpualloc():
    """Check that alloc/join are replaced by gpu_alloc/gpu_join on the
    GPU, and that CPU and GPU graphs compute the same values.
    """
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    f = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                        mode=mode_without_gpu)
    f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)),
                            mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                             mode=mode_with_gpu)

    # toposort() rebuilds the list on every call; compute each once.
    topo = f.maker.env.toposort()
    topo_gpu = f_gpu.maker.env.toposort()
    topo_gpu2 = f_gpu2.maker.env.toposort()

    assert sum([node.op == T.alloc for node in topo]) == 2
    assert sum([node.op == T.join for node in topo]) == 1
    assert sum([node.op == B.gpu_alloc for node in topo_gpu]) == 2
    assert sum([node.op == B.gpu_join for node in topo_gpu]) == 1
    assert sum([node.op == B.gpu_alloc for node in topo_gpu2]) == 2
    assert sum([node.op == B.gpu_join for node in topo_gpu2]) == 1

    # Numeric checks for BOTH GPU graphs.  The original never verified
    # f_gpu's values; its graph has no "+ 4", so compensate for it.
    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
    assert numpy.allclose(f(a_val, b_val),
                          numpy.asarray(f_gpu(a_val, b_val)) + 4)
if __name__ == '__main__': if __name__ == '__main__':
test_gpujoin_twomatrices_joincolumns() test_gpujoin_twomatrices_joincolumns()
test_gpujoin_assert_cndas() test_gpujoin_assert_cndas()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论