Fix include dependencies for mass reformatting patch

This patch fixes include dependencies in files that relied on transitive includes and therefore failed to build under https://review.mlplatform.org/c/ml/ComputeLibrary/+/10287.

It also works around some clang-format glitches by adding blank lines after preprocessor comments and a missing newline at end of file.
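
The failure mode being fixed is the usual transitive-include one; a minimal sketch with hypothetical headers (not taken from this repository):

    // consumer.h (hypothetical): uses uint32_t but does not include <cstdint>
    inline uint32_t low_bits(uint64_t v) { return static_cast<uint32_t>(v); }

    // user.cpp (hypothetical)
    #include "provider.h" // used to pull in <cstdint> transitively
    #include "consumer.h" // once the include set is re-sorted or trimmed, this
                          // is compiled without <cstdint> in scope and fails

The fixes below therefore make each file include what it uses directly (<cstdint>, <cmath>, CoreTypes.h, CpuTypes.h, CPPTypes.h, and so on).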

Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Change-Id: I8e9d3307edd2d1afd17c685c9bc9429624130e5a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10313
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: <felixjohnny.thomasmathibalan@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index a88b193..46a0f62 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -200,6 +200,7 @@
         sum_sq += static_cast<AccType>(vaddv(vmul(val_low, val_low)));
         sum_sq += static_cast<AccType>(vaddv(vmul(val_high, val_high)));
 #else  // __aarch64__
+
         // only AArch64 supports vaddv
         const int64x2_t pair_sum_low  = vpaddl(val_low);
         const int64x2_t pair_sum_high = vpaddl(val_high);
@@ -317,4 +318,4 @@
     },
     input_iterator, output_iterator);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h
index ebb64d9..729d9ff 100644
--- a/src/core/NEON/wrapper/traits.h
+++ b/src/core/NEON/wrapper/traits.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_WRAPPER_TRAITS_H
-#define ARM_COMPUTE_WRAPPER_TRAITS_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H
+#define ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H
+
+#include "arm_compute/core/CoreTypes.h"
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#include "src/cpu/CpuTypes.h" // required for float16_t
+#endif                        // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 
 #include <arm_neon.h>
 
@@ -30,6 +36,9 @@
 #include <arm_sve.h>
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
 
+#include <cmath>
+#include <cstdint>
+
 namespace arm_compute
 {
 namespace wrapper
@@ -151,4 +160,4 @@
 } // namespace traits
 } // namespace wrapper
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_TRAITS_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_TRAITS_H
diff --git a/src/core/NEON/wrapper/wrapper.h b/src/core/NEON/wrapper/wrapper.h
index e5467e9..f3f3c5d 100644
--- a/src/core/NEON/wrapper/wrapper.h
+++ b/src/core/NEON/wrapper/wrapper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_WRAPPER_H
-#define ARM_COMPUTE_WRAPPER_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H
+#define ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H
+
+#include "arm_compute/core/Error.h"
 
 // Traits
 #include "src/core/NEON/wrapper/traits.h"
@@ -31,4 +33,4 @@
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/core/NEON/wrapper/scalar/scalar.h"
 
-#endif /* ARM_COMPUTE_WRAPPER_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_WRAPPER_H
diff --git a/src/cpu/kernels/cast/generic/neon/bfloat16.cpp b/src/cpu/kernels/cast/generic/neon/bfloat16.cpp
index 91c15be..d8e2756 100644
--- a/src/cpu/kernels/cast/generic/neon/bfloat16.cpp
+++ b/src/cpu/kernels/cast/generic/neon/bfloat16.cpp
@@ -23,6 +23,7 @@
  */
 #if defined(ARM_COMPUTE_ENABLE_BF16)
 
+#include "arm_compute/core/CPP/CPPTypes.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/CpuCastKernel.h"
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp
index 385ca18..6cd0c85 100644
--- a/src/cpu/kernels/cast/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 
+#include "arm_compute/core/CPP/CPPTypes.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "src/cpu/kernels/CpuCastKernel.h"
 #include "src/cpu/kernels/cast/list.h"
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
index 447b740..16ac16b 100644
--- a/src/cpu/operators/CpuConv2d.cpp
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -215,6 +215,7 @@
             }
         }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
         // For 1x1 convolutions run the default GEMM
         if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
         {
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
index 1bcd37d..e2b8584 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/utils/misc/Utility.h"
 #include "ckw/TensorTileSampler.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 
 #include <algorithm>