ONCPUML-1072: Tuned MWS values (for N1, V1) for binary operators used by oneDNN. Added approximate values for MWS for the following binary operators: Add, Sub, Mul, Min, Max, Div. Change-Id: I5c4c75511129982a3f44c038ee272f09598469de Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com> Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/459609 Tested-by: bsgcomp <bsgcomp@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Comments-Addressed: bsgcomp <bsgcomp@arm.com> Signed-off-by: fadara01 <fadi.arafeh@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8392 Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>

commit: 73bb6b7ad80801e56633ad4ea12b0404b586a979 [log] [tgz]
author: Fadi Arafeh <fadi.arafeh@arm.com> Thu Oct 06 16:20:14 2022 +0000
committer: fadi.arafeh <fadi.arafeh@arm.com> Tue Nov 22 14:04:45 2022 +0000
tree: 9f35a75499df4e1cc49cc6f3336c805384a53c13
parent: ca1a52d14551147456a9a1ea2e24f5c141a6d80e [diff] [blame]
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 1648a46..ec210a4 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp

@@ -33,6 +33,11 @@
 #include "src/cpu/kernels/add/list.h"
 #include <array>
 
+namespace
+{
+    static constexpr size_t default_mws_N1_fp32_neon = 24536; // approximate tuned MWS that get_mws() starts from for add_fp32_neon when the CPU model is N1
+    static constexpr size_t default_mws_V1_fp32_neon = 40510; // approximate tuned MWS that get_mws() starts from for add_fp32_neon when the CPU model is V1
+} // namespace
 namespace arm_compute
 {
 namespace cpu
@@ -267,8 +272,41 @@
 size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
 {
     ARM_COMPUTE_UNUSED(thread_count);
-    ARM_COMPUTE_UNUSED(platform);

+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &add_fp32_neon) // tuned MWS values are applied only when the selected kernel is add_fp32_neon
+    {
+        size_t mws = ICPPKernel::default_mws; // placeholder: overwritten for N1/V1, every other CPU model returns early below
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws; // no tuned value for this CPU model
+        }
+
+        // Window has a single dimension (tensor is 1D or was re-interpreted as 1D): return the tuned value unscaled.
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // Scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance. (MWS presumably = minimum workload size; confirm.)
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); // NOTE(review): assuming integral returns, the divisor is 0 when num_iterations(1) or num_iterations_total() is 0 - confirm the window is never empty here
+            return std::max(static_cast<size_t>(1), mws); // clamp so the scaled value is never 0
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
     return ICPPKernel::default_mws;
 }
commit	73bb6b7ad80801e56633ad4ea12b0404b586a979	[log] [tgz]
author	Fadi Arafeh <fadi.arafeh@arm.com>	Thu Oct 06 16:20:14 2022 +0000
committer	fadi.arafeh <fadi.arafeh@arm.com>	Tue Nov 22 14:04:45 2022 +0000
tree	9f35a75499df4e1cc49cc6f3336c805384a53c13
parent	ca1a52d14551147456a9a1ea2e24f5c141a6d80e [diff] [blame]