Fix doxygen warnings

Resolves: COMPMID-6312
Signed-off-by: ramy.elgammal@arm.com <ramy.elgammal@arm.com>
Change-Id: I9f68ccd2edb8c4d03fec19e6b9c29609d4833342
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9806
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index e4ed47e..0e938cb 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,7 @@
  * @param[in] offset The offset within the vector. Offset can only be of the same size of the OpenCL vector (2,3,4,8,16)
  * @param[in] n0     The number of consecutive columns to access. n0 + offset must be <= 16
  * @param[in] x      Vector to access
- * @{
+ *
  */
 #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
 #define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
@@ -281,6 +281,7 @@
  */
 #define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 #define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_TENSOR_M0XN0
 
 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
  * @name LOAD_ROW_n
@@ -492,7 +493,7 @@
     LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
     VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
     (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
-/** @} */ // end of groupd LOAD_ROW_PARTIAL_n
+/** @} */ // end of group LOAD_ROW_PARTIAL_n
 
 /** Partially load a block of the given size LOAD_M0xLOAD_N0
  * @name LOAD_BLOCK_PARTIAL
@@ -697,6 +698,7 @@
     LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
 
 #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+/** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE
 
 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
  * @name LOAD_TEXTURE2D_ROW_n
@@ -952,6 +954,7 @@
         BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##F = 0;
+/** @} */ // end of group LOAD_ROW_INDIRECT_n
 
 /** Load blocks (consecutive rows and columns) with Y offset.
  * @name LOAD_BLOCK_INDIRECT
@@ -975,6 +978,7 @@
  */
 #define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
 #define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+/** @} */ // end of group LOAD_BLOCK_INDIRECT
 
 /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
  * @name LOAD_ELEMENT_n
@@ -1328,7 +1332,7 @@
 #define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
     VEC_DATA_TYPE(TYPE, 16)                                \
     BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
-/** @} */ // end of group COLUMN_VECTORn
+/** @} */ // end of group COLUMN_VECTOR_SCALARn
 
 /** Create transposed vectors of the given vectors
  * @name TRANSPOSE_K0Xn
@@ -1561,6 +1565,7 @@
 #define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
     ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
     BASENAME##F += BIAS;
+/** @} */ // end of group ADD_ROW_BROADCAST_n
 
 /** Broadcast (add a value) to the each element of the destination block (BASENAME)
  * @name ADD_BLOCK_BROADCAST
@@ -1674,6 +1679,7 @@
  * @param[in] DATA_TYPE    The data type of the vectors
  * @param[in] BASENAME_SRC The basename of the source variables
  * @param[in] BASENAME_DST The basename of the destination variables
+ * @{
  */
 #define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
     VEC_DATA_TYPE(DATA_TYPE, N)                                 \
@@ -1765,7 +1771,8 @@
  * @param[in] DATA_TYPE    The data type of the vectors
  * @param[in] BASENAME_SRC The basename of the source variables
  * @param[in] BASENAME_DST The basename of the destination variables
+ * @{
  */
 #define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
 #define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-/** @} */ // end of group CONVERT_BLOCK
\ No newline at end of file
+/** @} */ // end of group CONVERT_BLOCK
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index c5d94cc..e0fd8dc 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -463,6 +463,7 @@
  */
 #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
 #define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+/** @} */
 
 /** Utility macro to write a 2D OpenCL image object.
  *
@@ -479,6 +480,7 @@
  */
 #define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
 #define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
+/** @} */
 
 #define VSTORE_STR(size) vstore##size
 #define VSTORE(size) VSTORE_STR(size)
diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h
index 56b1538..4ba2b2c 100644
--- a/src/core/CL/cl_kernels/load_store_utility.h
+++ b/src/core/CL/cl_kernels/load_store_utility.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -463,8 +463,6 @@
     }
 /** @} */ // end of group STORE_BLOCK_PARTIAL
 
-#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
 /** Boundary-aware GEMM block store
  * @name STORE_BLOCK_BOUNDARY_AWARE
  * This macro assumes the following schemes to achieve boundary-awareness:
@@ -516,6 +514,7 @@
  * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  * @{
  */
+#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
 #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case1: No partial blocks in either x or y
 #define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
@@ -541,7 +540,6 @@
 #endif    // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
 /** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
 
-#if defined(PARTIAL_STORE_M0)
 /** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
  * @name COMPUTE_M0_START_ROW
  * If there're any partial blocks in y dimension, they are placed at the beginning of the rows.
@@ -558,6 +556,7 @@
  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
  * @{
  */
+#if defined(PARTIAL_STORE_M0)
 #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
     ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
 #else // defined(PARTIAL_STORE_M0)
@@ -567,7 +566,7 @@
 /** @} */ // end of group COMPUTE_M0_START_ROW
 
 /** Store a vector that can only be partial in x.
- *
+ * @name STORE_VECTOR_SELECT
  * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
  *
  * The data to store is expected to end in a 0.
@@ -583,4 +582,4 @@
  */
 #define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
     STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
-/** @} */ // end of group STORE_VECTOR_SELECT
\ No newline at end of file
+/** @} */ // end of group STORE_VECTOR_SELECT
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index 5b8d2be..e6d0e53 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -52,7 +52,8 @@
  *
  * @return A 16-component vector in QASYMM8 format, saturated to fit
  */
-uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
+template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
+qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
 
 /** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
  *
@@ -64,7 +65,8 @@
  *
  * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
  */
-int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
+template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
+qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
 
 /** Performs final quantization step on 16 elements
  *
@@ -716,11 +718,6 @@
     return { pa, pb };
 }
 
-template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
-qasymm8x16_signed_t vmlaq_qasymm8(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
-
-template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
-qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
 } // namespace arm_compute
 #include "src/core/NEON/NEAsymm.inl"
 #endif // ARM_COMPUTE_NEASYMM_H
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
index 0f91fe3..82821af 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,6 +61,9 @@
   }
 };
 
+/**
+ * \relates DepthwiseImplementation
+ */
 template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
 const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *depthwise_implementation_list();
 
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
index 78320ce..235aa1b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,9 @@
   }
 };
 
+/**
+ * \relates PoolingImplementation
+ */
 template <typename TInput, typename TOutput, class OutputStage = Nothing>
 const PoolingImplementation<TInput, TOutput, OutputStage> *pooling_implementation_list();