Add output operator for dynamic fusion

* The output of the fused operator must be explicitly specified
  using GpuOutput operator.
* Any temporary tensors used to connect the output of an operator
  to the input of another operator will be marked as no-alloc
  and won't be allocated as a tensor in the memory.

Resolves: COMPMID-5771
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I5ae8e800f8f737db23a055a92b01c4f1d78c3bb8
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8794
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index 996bf15..6c1e0fb 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -110,8 +110,8 @@
 R"_(
     LOOP_UNROLLING(int, i, 0, 1, M0,
     {
-        g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{dst}}_w * {{dst}}_h) - 1);
-        g_dst_indirect_y[i].v += g_ind_2 * (int)({{dst}}_w * {{dst}}_h);
+        g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{out}}_w * {{out}}_h) - 1);
+        g_dst_indirect_y[i].v += g_ind_2 * (int)({{out}}_w * {{out}}_h);
     })
     }
     //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
@@ -194,6 +194,7 @@
         lut["lhs"] = vtable.get_variable(_lhs);
         lut["rhs"] = vtable.get_variable(_rhs);
         lut["dst"] = vtable.get_variable(_dst);
+        lut["out"] = vtable.get_variable(comp_group.get_dst_tensors().front());
     }
     else
     {