Add some optimizations and start work on the ops needed for a simple NN.

d8cc425e · Arnaud Bergeron · cd18dd6b · d8cc425e · d8cc425e · d8cc425e
--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -43,6 +43,8 @@ if pygpu:
            # TODO add optimization tags here (when we will have some)
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
+            optdb.add_tags('gpuarray_opt', 'fast_run', 'inplace')
+            optdb.add_tags('gpuarray_after_fusion', 'fast_run', 'inplace')
        elif config.gpuarray.init_device != '':
            init_dev(config.gpuarray.init_device)
        else:

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -31,6 +31,9 @@ class HostFromGpu(Op):
    def __hash__(self):
        return hash(type(self))

+    def __str__(self):
+        return 'HostFromGpu(gpuarray)'  # fixed label, incl. '(gpuarray)' suffix
+
    def make_node(self, x):
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
@@ -110,6 +113,9 @@ class GpuFromHost(Op):
    def __hash__(self):
        return hash(type(self))

+    def __str__(self):
+        return 'GpuFromHost(gpuarray)'  # fixed label, incl. '(gpuarray)' suffix
+
    def make_node(self, x):
        if not isinstance(x.type, tensor.TensorType):
            raise TypeError(x)
@@ -193,6 +199,9 @@ class GpuFromCuda(Op):
    def __hash__(self):
        return hash(type(self))

+    def __str__(self):
+        return 'GpuFromCuda'  # plain class name, no backend suffix
+
    def make_node(self, x):
        from theano.sandbox.cuda import CudaNdArrayType
        if not isinstance(x.type, CudaNdArrayType):
@@ -328,6 +337,9 @@ class CudaFromGpu(Op):
    def __hash__(self):
        return hash(type(self))

+    def __str__(self):
+        return 'CudaFromGpu'  # this op's own name, not the inverse GpuFromCuda
+
    def make_node(self, x):
        from theano.sandbox.cuda import CudaNdArrayType
        if not isinstance(x.type, GpuArrayType):
@@ -412,3 +424,60 @@ class CudaFromGpu(Op):


 cuda_from_gpu = CudaFromGpu()
+
+
+class GpuAlloc(Op):
+    """Allocate a GPU array of the requested shape filled with `value`.
+
+    Inputs are the fill value followed by one integer scalar per output
+    dimension.
+    """
+
+    def __str__(self):
+        return 'GpuAlloc'
+
+    def make_node(self, value, *shape):
+        """Build the Apply node for `value` and the `shape` entries.
+
+        Raises TypeError when the number of shape entries differs from
+        `value`'s ndim, or when a shape entry does not have an integer dtype.
+        """
+        v = as_gpuarray_variable(value)
+        sh = [tensor.as_tensor_variable(s) for s in shape]
+        if v.ndim != len(shape):
+            raise TypeError(
+                'GpuAlloc requires value of same dimensions as shape',
+                value, len(shape))
+        bcast = []
+        for s in sh:
+            # dtype[:3] is 'int' for intN and 'uin' for uintN.
+            if s.type.dtype[:3] not in ('int', 'uin'):
+                raise TypeError('Shape arguments must be integers', s)
+            try:
+                const_shp = tensor.get_constant_value(s)
+            except TypeError:
+                const_shp = None
+            # A dimension is broadcastable only when it is a known constant 1.
+            bcast.append(numpy.all(1 == const_shp))
+        otype = GpuArrayType(dtype=v.dtype, broadcastable=bcast)
+        return Apply(self, [v] + sh, [otype()])
+
+    def perform(self, node, inputs, outs):
+        """Fill the output storage with `inputs[0]`, reusing it if possible."""
+        out, = outs
+        v = inputs[0]
+        sh = tuple(map(int, inputs[1:]))
+        # Reallocate only when there is no buffer yet or its shape changed.
+        if out[0] is None or out[0].shape != sh:
+            out[0] = gpuarray.empty(sh, dtype=v.dtype)
+        out[0][...] = v
+
+    def infer_shape(self, node, input_shapes):
+        # The output shape is exactly the symbolic shape inputs.
+        return [node.inputs[1:]]
+
+    def grad(self, input, grads):
+        # No gradient is defined yet: one None per input.
+        return [None for _ in input]
+
+gpu_alloc = GpuAlloc()
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
+import theano
+from theano import tensor
+from theano.compile import optdb
+from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
+                        Optimizer, toolbox, DestroyHandler,
+                        InconsistencyError, EquilibriumOptimizer)
+
+from theano.gof.python25 import all, any
+from theano.sandbox.gpuarray.type import GpuArrayType
+from theano.sandbox.gpuarray.basic_ops import host_from_gpu, gpu_from_host
+
+# Databases filled in by this module: `gpu_optimizer` receives the local
+# optimizers registered through register_opt(), `gpu_cut_copies` the ones
+# registered below to cut host<->GPU transfers.
+gpu_optimizer = EquilibriumDB()
+gpu_cut_copies = EquilibriumDB()
+
+# Sequence holding the databases above (registered at positions 1 and 2).
+gpu_seqopt = SequenceDB()
+
+# NOTE(review): 'optimiziations' is misspelled, but it is the registered
+# name; check for users of that key before renaming it.
+gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
+                    'fast_run', 'inplace', 'gpuarray')
+gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
+                    'fast_run', 'gpuarray')
+
+# Placed one position before 'add_destroy_handler' (49.5 is used when that
+# entry has no recorded position).
+optdb.register('gpuarray_opt', gpu_seqopt,
+               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
+               'gpu')
+
+# The same sequence is registered a second time through a ProxyDB, one
+# position after 'elemwise_fusion' (fallback position 71).
+optdb.register('gpuarray_after_fusion', ProxyDB(gpu_seqopt),
+               optdb.__position__.get('elemwise_fusion', 71) + 1,
+               'gpu')
+
+def register_opt(*tags, **kwargs):
+    """Return a decorator that registers a local optimizer in `gpu_optimizer`.
+
+    The optimizer is registered under the `name` keyword argument when one
+    is given (and truthy), otherwise under its own `__name__`, with the
+    tags 'fast_run' and 'gpu' followed by `tags`.  The decorated object is
+    returned unchanged.
+    """
+    def f(local_opt):
+        # .get() instead of .pop(): other keywords without 'name' no longer
+        # raise KeyError, and kwargs is left intact if f is applied again.
+        name = kwargs.get('name') or local_opt.__name__
+        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpu', *tags)
+        return local_opt
+    return f
+
+# Make the stock shape_i tracking optimization part of the GPU local opts.
+register_opt()(theano.tensor.opt.local_track_shape_i)
+
+class InputToGpuOptimizer(Optimizer):
+    """Transfer the inputs to the GPU to start the rolling wave.
+
+    Every graph input `x` that is not skipped by the checks in `apply` is
+    replaced by `host_from_gpu(gpu_from_host(x))`.
+    """
+
+    def add_requirements(self, fgraph):
+        fgraph.attach_feature(toolbox.ReplaceValidate())
+        fgraph.attach_feature(DestroyHandler())
+
+    def apply(self, fgraph):
+        for inp in fgraph.inputs:
+            # Already a GPU variable: nothing to transfer.  Skip only this
+            # input (`continue`); returning here would leave every later
+            # input untouched.
+            if isinstance(inp.type, GpuArrayType):
+                continue
+
+            # The only client is the graph output or an existing
+            # gpu_from_host: no transfer is inserted for this input.
+            if (len(inp.clients) == 1 and
+                (inp.clients[0][0] == 'output' or
+                 inp.clients[0][0].op == gpu_from_host)):
+                continue
+
+            try:
+                new_input = host_from_gpu(gpu_from_host(inp))
+                fgraph.replace_validate(inp, new_input,
+                                        "InputToGpuOptimizer")
+            except TypeError:
+                # This could fail if the inputs are not TensorTypes
+                pass
+
+gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
+                    0, 'fast_run', 'fast_compile', 'merge')
+
+@local_optimizer([])
+def local_cut_gpu_host_gpu(node):
+    """Bypass a transfer whose input comes from the opposite transfer.
+
+    When check_chain reports that `node` heads a gpu_from_host/host_from_gpu
+    chain (in either order), return a one-element list holding the input of
+    the inner transfer; otherwise return False.
+    """
+    chains = ((gpu_from_host, host_from_gpu),
+              (host_from_gpu, gpu_from_host))
+    for outer, inner in chains:
+        if tensor.opt.opt.check_chain(node, outer, inner):
+            return [node.inputs[0].owner.inputs[0]]
+    return False
+# The transfer-cutting optimizer runs in gpu_cut_copies, next to constant
+# folding, and is also registered in the 'canonicalize' database.
+gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu,
+                        'fast_run', 'inplace', 'gpu')
+gpu_cut_copies.register('cut_gpu_constant_transfers',
+                        tensor.opt.constant_folding,
+                        'fast_run', 'gpu')
+optdb['canonicalize'].register('local_cut_gpu_host_gpu',
+                               local_cut_gpu_host_gpu, 'fast_run', 'gpu')