Add in-place summation to CPU GEMM kernels

Instead of dispatching the sum postop for GEMM kernels to a
separate kernel plus an add, which requires an extra
destination-sized allocation and 3 extra loads/stores per element,
perform the accumulation directly in the GEMM kernel.

Resolves: ONCPUML-1442

Signed-off-by: Radu Salavat <radu.salavat@arm.com>
Co-authored-by: Milos Puzovic <milos.puzovic@arm.com>
Change-Id: I7a1f2da3300875fa1ac88b705a34390969518077
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11298
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp
index 9a913c5..5d7cf79 100644
--- a/src/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,6 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
+#ifndef ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP
+#define ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP
+
 #pragma once
 
 #include "arm_gemm_local.hpp"
@@ -151,6 +155,7 @@
     int               _maxthreads;
     bool              _fixed_format;
     bool              _fast_mode;
+    bool              _accumulate;
     const GemmConfig *_cfg;
 
     GemmArgs(const CPUInfo    *ci,
@@ -165,6 +170,7 @@
              const int         maxthreads,
              bool              fixed_format = false,
              bool              fast_mode    = false,
+             bool              accumulate   = false,
              const GemmConfig *cfg          = nullptr)
         : _ci(ci),
           _Msize(M),
@@ -178,6 +184,7 @@
           _maxthreads(maxthreads),
           _fixed_format(fixed_format),
           _fast_mode(fast_mode),
+          _accumulate(accumulate),
           _cfg(cfg)
     {
     }
@@ -278,3 +285,5 @@
 bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const OutputStage & = {});
 
 } // namespace arm_gemm
+
+#endif // ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP