ONCPUML-1072: Tuned MWS values (for N1, V1) for binary operators used by oneDNN

Added approximate values for MWS for the following binary operators:
Add, Sub, Mul, Min, Max, Div

Change-Id: I5c4c75511129982a3f44c038ee272f09598469de
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/459609
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Signed-off-by: fadara01 <fadi.arafeh@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8392
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp
index 6382ffd..d6d91df 100644
--- a/src/common/cpuinfo/CpuModel.cpp
+++ b/src/common/cpuinfo/CpuModel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,6 +54,7 @@
         case CpuModel::X1:
         case CpuModel::V1:
         case CpuModel::A64FX:
+        case CpuModel::N1:
             return true;
         default:
             return false;
@@ -69,6 +70,7 @@
         case CpuModel::A510:
         case CpuModel::X1:
         case CpuModel::V1:
+        case CpuModel::N1:
             return true;
         default:
             return false;
@@ -116,9 +118,11 @@
                     model = CpuModel::GENERIC_FP16;
                 }
                 break;
+            case 0xd0c: // N1
+                model = CpuModel::N1;
+                break;
             case 0xd06: // A65
             case 0xd0b: // A76
-            case 0xd0c: // N1
             case 0xd0d: // A77
             case 0xd0e: // A76AE
             case 0xd41: // A78
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 1648a46..ec210a4 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -33,6 +33,11 @@
 #include "src/cpu/kernels/add/list.h"
 #include <array>
 
+namespace
+{
+    static constexpr size_t default_mws_N1_fp32_neon = 24536;
+    static constexpr size_t default_mws_V1_fp32_neon = 40510;
+}
 namespace arm_compute
 {
 namespace cpu
@@ -267,8 +272,41 @@
 size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
 {
     ARM_COMPUTE_UNUSED(thread_count);
-    ARM_COMPUTE_UNUSED(platform);
 
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &add_fp32_neon)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
     return ICPPKernel::default_mws;
 }
 
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index e2062c8..9921fea 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -85,7 +85,7 @@
      * @param[in] platform     The CPU platform used to create the context.
      * @param[in] thread_count Number of threads in the execution.
      *
-     * @return[out] small_network_mws          Minimum workload size for requsted configuration.
+     * @return[out] mws Minimum workload size for requested configuration.
      */
     size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
 
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index 4b285fc..e76b05f 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -32,6 +32,14 @@
 
 #include <arm_neon.h>
 
+namespace
+{
+    static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
+    static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
+    static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
+    static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
+}
+
 namespace arm_compute
 {
 namespace cpu
@@ -401,6 +409,48 @@
     return Status{};
 }
 
+size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN>
+    || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_min_max_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_min_max_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+    return ICPPKernel::default_mws;
+}
+
 /** The division operator */
 
 void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
@@ -410,6 +460,47 @@
     CpuArithmeticKernel::configure_common(src0, src1, dst);
 }
 
+size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_div_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_div_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+    return ICPPKernel::default_mws;
+}
+
 Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32);
diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h
index 2785e0a..634e38b 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/cpu/kernels/CpuElementwiseKernel.h
@@ -72,8 +72,8 @@
     static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
 
 protected:
-    std::function<ElementwiseFunction> _run_method{ nullptr };
-    std::string                        _name{};
+    ElementwiseKernelPtr _run_method{ nullptr };
+    std::string          _name{};
 };
 
 class CpuArithmeticKernel : public CpuElementwiseKernel<CpuArithmeticKernel>
@@ -100,6 +100,15 @@
 
     static const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &get_available_kernels();
 
+    /** Return minimum workload size of the relevant kernel
+     *
+     * @param[in] platform     The CPU platform used to create the context.
+     * @param[in] thread_count Number of threads in the execution.
+     *
+     * @return[out] mws Minimum workload size for requested configuration.
+     */
+    size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
 protected:
     /** Commmon configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
      */
@@ -108,16 +117,6 @@
     static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
 
     ArithmeticOperation _op{};
-
-private:
-    /** Function to get the micro kernel implementation
-     *
-     * @param[in] src0 First input tensor information
-     * @param[in] src1 Second input tensor information
-     * @param[in] dst  Output tensor information
-     *
-     * @return the function instance for the micro kernel
-     */
 };
 
 class CpuDivisionKernel : public CpuArithmeticKernel
@@ -141,6 +140,15 @@
      */
     static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
+    /** Return minimum workload size of the relevant kernel
+     *
+     * @param[in] platform     The CPU platform used to create the context.
+     * @param[in] thread_count Number of threads in the execution.
+     *
+     * @return[out] mws Minimum workload size for requested configuration.
+     */
+    size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
 protected:
     // Inherited methods overridden:
     static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index 82e5445..81bb85c 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -34,6 +34,11 @@
 
 #include <arm_neon.h>
 
+namespace
+{
+    static constexpr size_t default_mws_N1_fp32_neon = 22447;
+    static constexpr size_t default_mws_V1_fp32_neon = 38982;
+}
 namespace arm_compute
 {
 namespace cpu
@@ -1909,6 +1914,47 @@
     ICpuKernel::configure(win);
 }
 
+size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_func_float == &mul_F32_F32_F32)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+    return ICPPKernel::default_mws;
+}
+
 Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
                               RoundingPolicy rounding_policy)
 {
diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h
index c92e1ef..73ffc0d 100644
--- a/src/cpu/kernels/CpuMulKernel.h
+++ b/src/cpu/kernels/CpuMulKernel.h
@@ -81,6 +81,14 @@
     const char *name() const override;
-    size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
 
+    /** Return minimum workload size of the relevant kernel
+     *
+     * @param[in] platform     The CPU platform used to create the context.
+     * @param[in] thread_count Number of threads in the execution.
+     *
+     * @return[out] mws Minimum workload size for requested configuration.
+     */
+    size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
     /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
       *
       * @return The split dimension hint.
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
index d908e4e..ad74dda 100644
--- a/src/cpu/kernels/CpuSubKernel.cpp
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -31,6 +31,11 @@
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/sub/neon/list.h"
 
+namespace
+{
+    static constexpr size_t default_mws_N1_fp32_neon = 24385;
+    static constexpr size_t default_mws_V1_fp32_neon = 40520;
+}
 namespace arm_compute
 {
 namespace cpu
@@ -137,6 +142,47 @@
     ICpuKernel::configure(win);
 }
 
+size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &sub_same_neon<float>)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+    return ICPPKernel::default_mws;
+}
+
 Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
index e835bac..3d80b34 100644
--- a/src/cpu/kernels/CpuSubKernel.h
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -73,6 +73,15 @@
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
+    /** Return minimum workload size of the relevant kernel
+     *
+     * @param[in] platform     The CPU platform used to create the context.
+     * @param[in] thread_count Number of threads in the execution.
+     *
+     * @return[out] mws Minimum workload size for requested configuration.
+     */
+    size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
     struct SubKernel
     {
         const char                  *name;