Update logic in the OpenMP scheduler to exclude LITTLE cores

On systems with BIG/MID/LITTLE cores, we need to exclude the LITTLE cores.

This makes changes to CPUInfo to detect the number of LITTLE cores and sets num_threads to TOTAL_CORES - NUM_LITTLE cores.

Resolves [COMPMID-7014]

Signed-off-by: Omar Al Khatib <omar.alkhatib@arm.com>
Change-Id: I3e1772e5b64d1c45304860be43233b7e5dd8dba1
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11565
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index 139d630..e5322bd 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -170,6 +170,18 @@
      * @return Number of CPUs
      */
     unsigned int get_cpu_num() const;
+    /** Return the maximum number of CPUs present excluding the little cores
+     * in case of an Android device
+     *
+     * @return Number of CPUs excluding little
+     */
+    unsigned int get_cpu_num_excluding_little() const;
+    /** Return whether the device has little, medium and big CPUs in case
+     * of an Android device, returns false otherwise
+     *
+     * @return Whether the device has little, medium and big CPUs
+     */
+    bool cpu_has_little_mid_big() const;
 
     /** Return the vector length in bytes for sme2
      *
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
index b522b40..c718e74 100644
--- a/arm_compute/runtime/OMP/OMPScheduler.h
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_OMPSCHEDULER_H
-#define ARM_COMPUTE_OMPSCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
 
 #include "arm_compute/runtime/IScheduler.h"
 
@@ -79,6 +79,8 @@
 
 private:
     unsigned int _num_threads;
+    bool         _has_lmb;
+    unsigned int _nonlittle_num_cpus;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_OMPSCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index f493ff6..a5f61d6 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -44,6 +44,7 @@
 v24.05 Public major release
  - Add @ref CLScatter operator for FP32/16, S32/16/8, U32/16/8 data types
  - Various fixes to enable FP16 kernels in armv8a multi_isa builds.
+ - Updated logic in the OpenMP scheduler to exclude LITTLE cores.
 
 v24.04 Public major release
  - Add Bfloat16 data type support for @ref NEMatMul.
diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp
index 809ab3e..92ba522 100644
--- a/src/common/cpuinfo/CpuInfo.cpp
+++ b/src/common/cpuinfo/CpuInfo.cpp
@@ -29,6 +29,7 @@
 #include "support/StringSupport.h"
 #include "support/ToolchainSupport.h"
 
+#include <map>
 #include <sstream>
 
 #if !defined(BARE_METAL)
@@ -269,6 +270,87 @@
     }
     return max_cpus;
 }
+
+const static std::map<std::string, std::vector<uint32_t>> known_configurations_with_little_cores = {
+    {"xiaomi14-pro", {379, 379, 923, 923, 923, 867, 867, 1024}}}; // compared with the capacities read from sysfs
+// Value returned as the non-LITTLE CPU count when the layout stored under the same key matches.
+const static std::map<std::string, uint32_t> number_of_cores_to_use = {{"xiaomi14-pro", 6}};
+
+#if defined(__ANDROID__)
+/** Read cpuN/cpu_capacity from sysfs for each CPU index below get_max_cpus(); unreadable entries are skipped. */
+std::vector<uint32_t> get_cpu_capacities()
+{
+    const auto            max_cpus = get_max_cpus(); // hoisted: no need to re-query on every iteration
+    std::vector<uint32_t> cpu_capacities;
+    for (int i = 0; i < max_cpus; ++i)
+    {
+        std::stringstream str;
+        str << "/sys/devices/system/cpu/cpu" << i << "/cpu_capacity";
+        std::ifstream file(str.str(), std::ios::in);
+        std::string   line;
+        // getline() fails on a stream that could not be opened; an empty line must not reach stoul().
+        if (bool(getline(file, line)) && !line.empty())
+        {
+            cpu_capacities.emplace_back(support::cpp11::stoul(line));
+        }
+    }
+
+    return cpu_capacities;
+}
+
+/** Count the CPUs to schedule on: a known device's preset, else those with capacity >= half the maximum. */
+uint32_t not_little_num_cpus_internal()
+{
+    const std::vector<uint32_t> cpus_all = get_cpu_capacities();
+
+    for (const auto &it : known_configurations_with_little_cores)
+    {
+        if (it.second == cpus_all)
+        {
+            const auto preset = number_of_cores_to_use.find(it.first);
+            if (preset != number_of_cores_to_use.end()) // tolerate a layout without a preset entry
+            {
+                return preset->second;
+            }
+        }
+    }
+
+    if (cpus_all.empty()) // no capacity could be read: max_element() would return end()
+    {
+        return static_cast<uint32_t>(get_max_cpus()); // nothing to classify, so exclude nothing
+    }
+    const uint32_t threshold = *std::max_element(cpus_all.begin(), cpus_all.end()) / 2;
+    return static_cast<uint32_t>(
+        std::count_if(cpus_all.begin(), cpus_all.end(), [threshold](uint32_t c) { return c >= threshold; }));
+}
+
+/** Report whether the device has little, mid and big CPUs.
+ *
+ * True when the capacity list matches a known layout or holds exactly three distinct values.
+ *
+ * @return Whether little, mid and big CPUs were detected
+ */
+bool has_little_mid_big_internal()
+{
+    std::vector<uint32_t> capacities = get_cpu_capacities();
+
+    // Known devices are matched on their exact per-CPU capacity list.
+    for (const auto &known : known_configurations_with_little_cores)
+    {
+        if (known.second == capacities)
+        {
+            return true;
+        }
+    }
+
+    // Sort so that equal capacities are adjacent, then drop the duplicates.
+    std::sort(capacities.begin(), capacities.end());
+    capacities.erase(std::unique(capacities.begin(), capacities.end()), capacities.end());
+
+    // Exactly three distinct capacities are read as little + mid + big.
+    return capacities.size() == 3;
+}
+#endif /* defined(__ANDROID__) */
 #elif defined(__aarch64__) && \
     defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
 /** Query features through sysctlbyname
@@ -402,6 +484,24 @@
     return _cpus.size();
 }
 
+uint32_t CpuInfo::not_little_num_cpus() const
+{
+#if defined(__ANDROID__) // NOTE(review): helper looks Arm-only (#if chain above) - confirm non-Arm Android builds
+    return not_little_num_cpus_internal(); // computed from the per-CPU cpu_capacity sysfs entries
+#else  /* defined(__ANDROID__) */
+    return num_cpus(); // LITTLE cores are only looked for on Android; report every CPU elsewhere
+#endif /* defined(__ANDROID__) */
+}
+
+bool CpuInfo::has_little_mid_big() const
+{
+#if defined(__ANDROID__) // NOTE(review): helper looks Arm-only (#if chain above) - confirm non-Arm Android builds
+    return has_little_mid_big_internal(); // decided from the per-CPU cpu_capacity sysfs entries
+#else  /* defined(__ANDROID__) */
+    return false; // never reported outside Android
+#endif /* defined(__ANDROID__) */
+}
+
 uint32_t num_threads_hint()
 {
     unsigned int num_threads_hint = 1;
diff --git a/src/common/cpuinfo/CpuInfo.h b/src/common/cpuinfo/CpuInfo.h
index 953e488..506830a 100644
--- a/src/common/cpuinfo/CpuInfo.h
+++ b/src/common/cpuinfo/CpuInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_COMMON_CPUINFO_H
-#define SRC_COMMON_CPUINFO_H
+#ifndef ACL_SRC_COMMON_CPUINFO_CPUINFO_H
+#define ACL_SRC_COMMON_CPUINFO_CPUINFO_H
 
 #include "src/common/cpuinfo/CpuIsaInfo.h"
 #include "src/common/cpuinfo/CpuModel.h"
@@ -120,6 +120,8 @@
     CpuModel cpu_model(uint32_t cpuid) const;
     CpuModel cpu_model() const;
     uint32_t num_cpus() const;
+    uint32_t not_little_num_cpus() const;
+    bool     has_little_mid_big() const;
 
 private:
     CpuIsaInfo            _isa{};
@@ -135,4 +137,4 @@
 uint32_t num_threads_hint();
 } // namespace cpuinfo
 } // namespace arm_compute
-#endif /* SRC_COMMON_CPUINFO_H */
+#endif // ACL_SRC_COMMON_CPUINFO_CPUINFO_H
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index f6761f2..ef0518e 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -145,5 +145,20 @@
     return 0;
 #endif // ARM_COMPUTE_ENABLE_SME2
 }
-
+bool CPUInfo::cpu_has_little_mid_big() const
+{
+#if defined(__ANDROID__)
+    return _impl->info.has_little_mid_big(); // forwarded to the wrapped info object
+#else  /* defined(__ANDROID__) */
+    return false; // never reported outside Android
+#endif /* defined(__ANDROID__) */
+}
+unsigned int CPUInfo::get_cpu_num_excluding_little() const
+{
+#if defined(__ANDROID__)
+    return _impl->info.not_little_num_cpus(); // forwarded to the wrapped info object
+#else  /* defined(__ANDROID__) */
+    return get_cpu_num(); // nothing is excluded outside Android
+#endif /* defined(__ANDROID__) */
+}
 } // namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index d4d6193..2a5abb5 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,10 +32,24 @@
 
 namespace arm_compute
 {
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
 OMPScheduler::OMPScheduler() // NOLINT
-    : _num_threads(omp_get_max_threads())
+    : _num_threads(cpu_info().get_cpu_num_excluding_little()), // Arm Android: default to the non-LITTLE CPU count
+      _has_lmb(cpu_info().cpu_has_little_mid_big()),
+      _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
 {
 }
+#else  /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+OMPScheduler::OMPScheduler() // NOLINT
+    : _num_threads(omp_get_max_threads()), // other platforms: keep the OpenMP maximum as the default
+      _has_lmb(cpu_info().cpu_has_little_mid_big()),
+      _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little())
+{
+}
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
 
 unsigned int OMPScheduler::num_threads() const
 {
@@ -45,7 +59,15 @@
 void OMPScheduler::set_num_threads(unsigned int num_threads)
 {
     const unsigned int num_cores = omp_get_max_threads();
-    _num_threads                 = (num_threads == 0) ? num_cores : num_threads;
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+    const unsigned int adjusted_num_threads = (_has_lmb) ? _nonlittle_num_cpus : num_threads; // overridden on L/M/B
+    _num_threads                            = (num_threads == 0) ? num_cores : adjusted_num_threads; // 0 -> num_cores
+#else  /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+    _num_threads = (num_threads == 0) ? num_cores : num_threads; // same rule as before this change
+#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
 }
 
 void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp
index 8198119..e126ade 100644
--- a/tests/validation/NEON/UNIT/RuntimeContext.cpp
+++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,6 +48,24 @@
 {
 TEST_SUITE(NEON)
 TEST_SUITE(UNIT)
+#if defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+TEST_CASE(CpuCapacity, framework::DatasetMode::ALL)
+{
+    CPUInfo& ci =  arm_compute::Scheduler::get().cpu_info();
+    const uint32_t total_num_cpus = ci.get_cpu_num();
+    const uint32_t nonlittle_num_cpus = ci.get_cpu_num_excluding_little();
+    const bool has_lmb = ci.cpu_has_little_mid_big();
+    const uint32_t num_threads = arm_compute::Scheduler::get().num_threads();
+
+    if(has_lmb){ // nothing is checked unless little, mid and big CPUs were detected
+        ARM_COMPUTE_EXPECT(total_num_cpus!=nonlittle_num_cpus , framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(num_threads==nonlittle_num_cpus , framework::LogLevel::ERRORS);
+    }
+}
+#endif /* defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+
 TEST_SUITE(RuntimeContext)
 
 TEST_CASE(Scheduler, framework::DatasetMode::ALL)