Optimize Neon™ Logistic Activation
- Use a 1D execution window to improve the memory access pattern.

Resolves: [COMPMID-5465]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: Ida30669ffa06eb002ca43a6edf15e25a6eaad2f6
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8344
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp
index fa152c9..a4d46db 100644
--- a/src/core/helpers/WindowHelpers.cpp
+++ b/src/core/helpers/WindowHelpers.cpp
@@ -234,15 +234,15 @@
 
 std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src0, const ITensorInfo &src1)
 {
-    const auto &shape0 = src0.tensor_shape();
-    const auto &shape1 = src1.tensor_shape();
-    const auto &strides0 = src0.strides_in_bytes();
-    const auto &strides1 = src1.strides_in_bytes();
-    const auto num_dimensions = std::max(src0.num_dimensions(), src1.num_dimensions());
+    const auto &shape0         = src0.tensor_shape();
+    const auto &shape1         = src1.tensor_shape();
+    const auto &strides0       = src0.strides_in_bytes();
+    const auto &strides1       = src1.strides_in_bytes();
+    const auto  num_dimensions = std::max(src0.num_dimensions(), src1.num_dimensions());
 
     Window win;
     size_t split_dimension = Window::DimY;
-    size_t dim = 0;
+    size_t dim             = 0;
 
     size_t squashed_bytes = src0.element_size();
 
@@ -282,4 +282,47 @@
 
     return std::make_pair(win, split_dimension);
 }
+
+std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src)
+{
+    // Yields {execution window, dimension to split on} for a single tensor: a flattened window
+    // when every stride matches the cumulative size of the lower dimensions, else the max window.
+    const auto &dims         = src.tensor_shape();
+    const auto &byte_strides = src.strides_in_bytes();
+    const auto  rank         = src.num_dimensions();
+
+    Window win;
+    size_t split_dimension = Window::DimY;
+
+    // Count the leading dimensions whose stride equals the bytes spanned by everything below them.
+    size_t packed_dims  = 0;
+    size_t packed_bytes = src.element_size();
+    while(packed_dims < rank && byte_strides[packed_dims] == packed_bytes)
+    {
+        packed_bytes *= dims[packed_dims];
+        ++packed_dims;
+    }
+
+    if(packed_dims != rank)
+    {
+        // A stride mismatch was found: fall back to a window spanning the full shape.
+        for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+        {
+            win.set(d, Window::Dimension(0, dims[d], 1));
+        }
+    }
+    else
+    {
+        // Every dimension matched: expose the tensor as one run along dimension 0.
+        split_dimension = Window::DimX;
+        win.set(0, Window::Dimension(0, packed_bytes / src.element_size(), 1));
+        for(size_t d = 1; d < Coordinates::num_max_dimensions; ++d)
+        {
+            win.set(d, Window::Dimension(0, 1, 1));
+        }
+    }
+
+    return std::make_pair(win, split_dimension);
+}
+
 } // namespace arm_compute