Commit 3c51a2f1 authored by Frederic Bastien

added GpuAlloc and opt to put them into the graph and test.

Parent a163846d
...@@ -105,7 +105,7 @@ if cuda_available: ...@@ -105,7 +105,7 @@ if cuda_available:
import basic_ops import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise, from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuDimShuffle, GpuSum, GpuReshape,
GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuAlloc,
GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4 GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4
, scalar, vector, matrix, row, col, tensor3, tensor4) , scalar, vector, matrix, row, col, tensor3, tensor4)
import opt import opt
......
...@@ -1817,6 +1817,76 @@ class GpuJoin(tensor.Join): ...@@ -1817,6 +1817,76 @@ class GpuJoin(tensor.Join):
gpu_join = GpuJoin() gpu_join = GpuJoin()
class GpuAlloc(Op):
    """GPU version of tensor.alloc: broadcast a scalar-like value to
    fill a CudaNdarray of the requested shape.

    The value is moved to the GPU in make_node; if that transfer turns
    out to be unneeded, the optimizer will remove it.
    """
    def __init__(self):
        pass

    def __eq__(self, other):
        # All GpuAlloc instances are interchangeable: compare by type only.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, value, *shape):
        """Build the Apply node.

        :param value: fully-broadcastable (scalar-like) value to replicate.
        :param shape: integer scalar variables giving the output dimensions.
        :raises TypeError: if any shape argument is not of integer dtype.
        """
        # If there are unneeded transfers generated by the next line,
        # the optimizer will remove them.
        v = gpu_from_host(tensor.as_tensor_variable(value))
        sh = [tensor.as_tensor_variable(s) for s in shape]
        # The value must be broadcastable in every dimension so that it
        # can fill an output of arbitrary shape.
        assert all(v.broadcastable)
        bcast = []
        for s in sh:
            if s.type.dtype[:3] not in ('int', 'uin'):
                raise TypeError('Shape arguments must be integers', s)
            # If s is constant 1, then we're broadcastable in that dim.
            try:
                const_shp = tensor.get_constant_value(s)
            except TypeError:
                const_shp = None
            bcast.append(numpy.all(1 == const_shp))
        otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
        return Apply(self, [v] + sh, [otype()])

    def perform(self, node, inputs, out_storage):
        out, = out_storage
        v = inputs[0]
        sh = tuple([int(i) for i in inputs[1:]])
        # Reuse the previous output buffer when its shape still matches;
        # otherwise allocate a fresh one.
        if out[0] is None or out[0].shape != sh:
            out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
        out[0][...] = v  # broadcast v to fill us up

    def c_code(self, node, name, inputs, out_storage, sub):
        out, = out_storage
        value = inputs[0]
        shps = inputs[1:]
        nd = len(shps)
        # Renamed from `str` so the builtin is not shadowed.
        code = "int dims[%(nd)s];\n" % locals()
        for idx, sh in enumerate(shps):
            code += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
        # Reallocate the output only when it is missing or mis-shaped.
        code += "if(%(out)s==NULL\n" % locals()
        for idx, sh in enumerate(shps):
            code += "||CudaNdarray_HOST_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]" % locals()
        code += """){
            Py_XDECREF(%(out)s);
            %(out)s= (CudaNdarray*)CudaNdarray_new_null();
            CudaNdarray_alloc_contiguous(%(out)s, %(nd)s, dims);
        }
        CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true);
        """ % locals()
        return code

    def infer_shape(self, node, input_shapes):
        # The output shape is exactly the shape arguments (inputs[1:]).
        return [node.inputs[1:]]

    def grad(self, inputs, grads):
        # Not differentiable with respect to any input.
        return [None for i in inputs]

    def c_code_cache_version(self):
        return (1,)

gpu_alloc = GpuAlloc()
# Those are predifined CudaNdarrayType as done in tensor.basic # Those are predifined CudaNdarrayType as done in tensor.basic
# Usefull mostly for test as the gpu op are inserted automatically... # Usefull mostly for test as the gpu op are inserted automatically...
......
...@@ -594,3 +594,19 @@ else: ...@@ -594,3 +594,19 @@ else:
compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion') compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc(node):
    """Replace a host-side alloc by gpu_alloc (wrapped in host_from_gpu)
    when every client of the alloc output is a join whose other inputs
    already come from the GPU or from another alloc.

    Two other candidate cases (the alloc's input coming from the GPU,
    and all clients transferring the result to the GPU) were considered
    but left out because we don't see how they can happen.  If you see
    one happening, add a test for it or contact the mailing list.
    """
    if node.op != tensor.alloc:
        return

    # Evaluate the condition for every client (eagerly, like the
    # original all([...]) over a list comprehension).
    client_ok = []
    for client, _idx in node.outputs[0].clients:
        ok = client.op == tensor.join and all(
            inp.owner and inp.owner.op in [host_from_gpu, tensor.alloc]
            for inp in client.inputs[1:])
        client_ok.append(ok)

    if all(client_ok):
        return [host_from_gpu(gpu_alloc(*node.inputs))]
...@@ -16,6 +16,7 @@ if cuda_ndarray.cuda_available == False: ...@@ -16,6 +16,7 @@ if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
import theano.sandbox.cuda.basic_ops as B
import theano.compile.mode import theano.compile.mode
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -626,6 +627,32 @@ def test_gpujoin_no_rebroadcast(): ...@@ -626,6 +627,32 @@ def test_gpujoin_no_rebroadcast():
l = f.maker.env.toposort() l = f.maker.env.toposort()
assert not any([isinstance(x.op,T.Rebroadcast) for x in l]) assert not any([isinstance(x.op,T.Rebroadcast) for x in l])
def test_gpujoin_gpualloc():
    """Check that alloc/join are replaced by gpu_alloc/gpu_join on the
    GPU, and that CPU and GPU graphs compute the same values.
    """
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    f = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                        mode=mode_without_gpu)
    f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)),
                            mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                             mode=mode_with_gpu)

    # toposort() rebuilds the list on every call; compute each once.
    topo = f.maker.env.toposort()
    topo_gpu = f_gpu.maker.env.toposort()
    topo_gpu2 = f_gpu2.maker.env.toposort()

    assert sum([node.op == T.alloc for node in topo]) == 2
    assert sum([node.op == T.join for node in topo]) == 1
    assert sum([node.op == B.gpu_alloc for node in topo_gpu]) == 2
    assert sum([node.op == B.gpu_join for node in topo_gpu]) == 1
    assert sum([node.op == B.gpu_alloc for node in topo_gpu2]) == 2
    assert sum([node.op == B.gpu_join for node in topo_gpu2]) == 1

    # Numeric checks for BOTH GPU graphs.  The original never verified
    # f_gpu's values; its graph has no "+ 4", so compensate for it.
    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
    assert numpy.allclose(f(a_val, b_val),
                          numpy.asarray(f_gpu(a_val, b_val)) + 4)
if __name__ == '__main__': if __name__ == '__main__':
test_gpujoin_twomatrices_joincolumns() test_gpujoin_twomatrices_joincolumns()
test_gpujoin_assert_cndas() test_gpujoin_assert_cndas()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论