Optimize Neon™ Logistic Activation
- Use a 1D execution window to improve the memory access pattern.

Resolves: [COMPMID-5465]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: Ida30669ffa06eb002ca43a6edf15e25a6eaad2f6
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8344
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp
index fa152c9..a4d46db 100644
--- a/src/core/helpers/WindowHelpers.cpp
+++ b/src/core/helpers/WindowHelpers.cpp
@@ -234,15 +234,15 @@
 
 std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src0, const ITensorInfo &src1)
 {
-    const auto &shape0 = src0.tensor_shape();
-    const auto &shape1 = src1.tensor_shape();
-    const auto &strides0 = src0.strides_in_bytes();
-    const auto &strides1 = src1.strides_in_bytes();
-    const auto num_dimensions = std::max(src0.num_dimensions(), src1.num_dimensions());
+    const auto &shape0         = src0.tensor_shape();
+    const auto &shape1         = src1.tensor_shape();
+    const auto &strides0       = src0.strides_in_bytes();
+    const auto &strides1       = src1.strides_in_bytes();
+    const auto  num_dimensions = std::max(src0.num_dimensions(), src1.num_dimensions());
 
     Window win;
     size_t split_dimension = Window::DimY;
-    size_t dim = 0;
+    size_t dim             = 0;
 
     size_t squashed_bytes = src0.element_size();
 
@@ -282,4 +282,47 @@
 
     return std::make_pair(win, split_dimension);
 }
+
+std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src)
+{
+    const auto &shape          = src.tensor_shape();
+    const auto &strides        = src.strides_in_bytes();
+    const auto  num_dimensions = src.num_dimensions();
+
+    Window win;
+    size_t split_dimension = Window::DimY;
+    size_t dim             = 0;
+    size_t squashed_bytes  = src.element_size();
+
+    // Squash the low dimensions together for as long as each stride equals the bytes accumulated so far.
+    for(; dim < num_dimensions; ++dim)
+    {
+        if(strides[dim] != squashed_bytes)
+        {
+            break;
+        }
+        squashed_bytes *= shape[dim];
+    }
+    if(dim == num_dimensions)
+    {
+        const auto squashed_elements = squashed_bytes / src.element_size();
+        split_dimension              = Window::DimX;
+        // Every dimension was squashed: treat the tensor as a 1D array along x and collapse the other dimensions to size 1.
+        win.set(0, Window::Dimension(0, squashed_elements, 1));
+        for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+        {
+            win.set(dim, Window::Dimension(0, 1, 1));
+        }
+    }
+    else
+    {
+        // A stride mismatch stopped the squash: generate the max window over the tensor shape (split dimension stays DimY).
+        for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+        {
+            win.set(dim, Window::Dimension(0, shape[dim], 1));
+        }
+    }
+    return std::make_pair(win, split_dimension);
+}
+
 } // namespace arm_compute
diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h
index c9e5a13..eccf7f2 100644
--- a/src/core/helpers/WindowHelpers.h
+++ b/src/core/helpers/WindowHelpers.h
@@ -176,6 +176,18 @@
     return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
 }
 
+/** Calculate the squashed or maximum window for the given tensor shape.
+ *
+ * If the tensor data resides contiguously in memory, the tensor can be interpreted
+ * as a 1D array and all the dimensions can be squashed together into the x-dimension.
+ * Otherwise, generate the max window for the given tensor shape.
+ *
+ * @param[in] src Tensor info object defining the shape of the input tensor.
+ *
+ * @return The maximum window the kernel can be executed on and the preferred split dimension.
+ */
+std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src);
+
 /** Calculate the squashed or maximum window for the given tensor shapes.
  *
  * If the tensor data resides continuously in the memory, the tensor can be interpreted
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index 61efcb2..f1e4858 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -182,10 +182,16 @@
 
 void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info)
 {
+    ARM_COMPUTE_UNUSED(dst);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
 
     const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), activation_info.activation() });
+    if(dst != nullptr)
+    {
+        // dst auto initialization if not yet initialized
+        auto_init_if_empty(*dst, *src->clone());
+    }
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
@@ -200,10 +206,20 @@
 #endif // __aarch64__
     _act_info = activation_info;
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(src, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICPPKernel::configure(win_config.second);
+    Window win;
+
+    if(src->data_layout() != DataLayout::NHWC)
+    {
+        // Use squashed window
+        std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src);
+        ICPPKernel::configure(win);
+    }
+    else
+    {
+        // Configure kernel window
+        win = calculate_max_window(*src, Steps());
+        ICPPKernel::configure(win);
+    }
 }
 
 Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h
index d856a93..fe2d783 100644
--- a/src/cpu/kernels/CpuActivationKernel.h
+++ b/src/cpu/kernels/CpuActivationKernel.h
@@ -73,6 +73,15 @@
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
+    /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+     *
+     * @return The split dimension hint.
+     */
+    size_t get_split_dimension_hint() const
+    {
+        return _split_dimension;
+    }
+
     struct ActivationKernel
     {
         const char                                *name;
@@ -85,6 +94,7 @@
 private:
     ActivationLayerInfo _act_info{};
     ActivationKernelPtr _run_method{ nullptr };
+    size_t              _split_dimension{ Window::DimY };
     std::string         _name{};
 };
 } // namespace kernels
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp
index 3945fa5..197e985 100644
--- a/src/cpu/operators/CpuActivation.cpp
+++ b/src/cpu/operators/CpuActivation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "src/cpu/operators/CpuActivation.h"
 
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/common/IOperator.h"
 #include "src/common/utils/LegacySupport.h"
 #include "src/common/utils/Log.h"
@@ -46,6 +47,13 @@
     return kernels::CpuActivationKernel::validate(input, output, activation_info);
 }
 
+void CpuActivation::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    auto split_dimension = static_cast<kernels::CpuActivationKernel *>(_kernel.get())->get_split_dimension_hint();
+    NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+
 std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
 {
     TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h
index 9b97c9d..f1807d5 100644
--- a/src/cpu/operators/CpuActivation.h
+++ b/src/cpu/operators/CpuActivation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,6 +48,9 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
 };
 } // namespace cpu
 } // namespace arm_compute