Optimize Neon™ SUB operator by squashing execution window

Resolves: COMPMID-5462
Change-Id: I2c7151c8faf4016cc33592fff04d492d7cbc8fd6
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8366
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
index c55d11e..d908e4e 100644
--- a/src/cpu/kernels/CpuSubKernel.cpp
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -131,7 +131,8 @@
     _name       = std::string("CpuSubKernel").append("/").append(uk->name);
 
     // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped
-    Window win = calculate_max_window(out_shape, Steps());
+    Window win;
+    std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src0, *src1);
 
     ICpuKernel::configure(win);
 }
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
index 323a3f1..e835bac 100644
--- a/src/cpu/kernels/CpuSubKernel.h
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -82,10 +82,16 @@
 
     static const std::vector<SubKernel> &get_available_kernels();
 
+    size_t get_split_dimension() const
+    {
+        return _split_dimension;
+    }
+
 private:
     ConvertPolicy _policy{};
     SubKernelPtr  _run_method{ nullptr };
     std::string   _name{};
+    size_t        _split_dimension{ Window::DimY };
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp
index f0a7770..91a5b6e 100644
--- a/src/cpu/operators/CpuSub.cpp
+++ b/src/cpu/operators/CpuSub.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 
 #include "src/common/utils/Log.h"
 
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 namespace arm_compute
 {
 namespace cpu
@@ -45,5 +47,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
 }
+
+void CpuSub::run(ITensorPack &tensors)
+{
+    const auto split_dimension = static_cast<kernels::CpuSubKernel *>(_kernel.get())->get_split_dimension();
+
+    NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h
index 025c928..d463d1e 100644
--- a/src/cpu/operators/CpuSub.h
+++ b/src/cpu/operators/CpuSub.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,6 +60,9 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
 };
 } // namespace cpu
 } // namespace arm_compute