Enable fusion of GpuElemwise ops.

fcbd4a34 · Arnaud Bergeron · 2bb9365e · fcbd4a34 · fcbd4a34
--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -95,6 +95,19 @@ class GpuElemwise(Op):
                                      sub=dict(fail='return;'))
        res.tag.kcode = kcode

+# Translate types for scalar composite ops (except complex).
+        support_code = """
+#define npy_float64 ga_double
+#define npy_float32 ga_float
+#define npy_uint8 ga_ubyte
+#define npy_int8 ga_byte
+#define npy_uint16 ga_ushort
+#define npy_int16 ga_short
+#define npy_uint32 ga_uint
+#define npy_int32 ga_int
+#define npy_uint64 ga_ulong
+#define npy_int64 ga_long
+"""
        try:
            code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
            if code:

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -142,3 +142,27 @@ def local_gpu_elemwise(node):
            return [host_from_gpu(gpu_elemwise)]
    else:
        return False
+
+def max_inputs_to_GpuElemwise(node):
+    """Return how many inputs fit in the kernel argument budget for `node`.
+
+    Every array argument is costed as one pointer plus one int per dimension,
+    with the dimension count read from the first input of `node`.
+    """
+    pointer_bytes, int_bytes = 8, 4  # hard-coded pointer and int sizes
+    # we take the limit from CUDA for now
+    budget = 232
+    rank = node.inputs[0].type.ndim
+    per_array = pointer_bytes + rank * int_bytes
+
+    # number of elements and shape, then one array argument per output
+    fixed = int_bytes * (rank + 1) + per_array * len(node.outputs)
+
+    return (budget - fixed) // per_array
+
+# Register GpuElemwise fusion in optdb ('gpu_elemwise_fusion', position 71.00).
+gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
+    GpuElemwise, max_inputs_to_GpuElemwise)
+optdb.register('gpu_elemwise_fusion',
+               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
+               71.00, 'fast_run', 'fusion', 'local_elemwise_fusion', 'gpu')