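COMPMID-3794: Fix window loops causing performance regression

The main x loops in these kernels were bounded by
`x < (window_end_x - window_step_x)`, so they stopped one step early
whenever exactly window_step_x elements remained. Change the bound to
`x <= (window_end_x - window_step_x)` so that the last full step is
also processed inside these loops.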

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Id4d95c6ce5fed91bb079b8bfe1abceedefd20c97
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4117
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index c022fa0..8c11574 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -110,7 +110,7 @@
     execute_window_loop(win, [&](const Coordinates & id)
     {
         int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Get bias and pointer to input
             const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
@@ -175,7 +175,7 @@
     execute_window_loop(win, [&](const Coordinates &)
     {
         int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Get bias and pointer to input
             const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
@@ -238,7 +238,7 @@
     {
 
         int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Get bias and pointer to input
             const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
@@ -323,7 +323,7 @@
     execute_window_loop(win, [&](const Coordinates &)
     {
         int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Get bias and pointer to input
             const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index 5710897..4ac33d1 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -139,7 +139,7 @@
             const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
 
             int x = window_start_x;
-            for(; x < (window_end_x - window_step_x); x += window_step_x)
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
                 // Compute the leftover term due to a_offset.
                 int32x4x4_t a_offset_term_s32 =
@@ -237,7 +237,7 @@
             const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
 
             int x = window_start_x;
-            for(; x < (window_end_x - window_step_x); x += window_step_x)
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
                 int32x4x4_t in_s32 =
                 {
@@ -291,7 +291,7 @@
             auto      mm_result_ptr      = reinterpret_cast<int32_t *>(mm_result_it.ptr());
 
             int x = window_start_x;
-            for(; x < (window_end_x - window_step_x); x += window_step_x)
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
                 // Compute the leftover term due to a_offset.
                 int32x4x4_t a_offset_term_s32 =
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 1310ef3..397eae9 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -1316,7 +1316,7 @@
                                  (_input->info()->strides_in_bytes().z());
 
         int x_off = window_start_x;
-        for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
         {
             const auto  in_x0_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off;
             const auto  in_x1_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off;
@@ -1432,7 +1432,7 @@
         const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
 
         int x_off = window_start_x;
-        for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
         {
             if(pooling_type != PoolingType::MAX)
             {
@@ -1943,7 +1943,7 @@
             const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
 
             int x_off = window_start_x;
-            for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
+            for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
             {
                 if(pooling_type != PoolingType::MAX)
                 {
@@ -2113,7 +2113,7 @@
                                  (_input->info()->strides_in_bytes().z());
 
         int x_off = window_start_x;
-        for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
         {
             const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
             const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
@@ -2337,7 +2337,7 @@
         const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
 
         int x_off = window_start_x;
-        for(; x_off < (window_end_x - window_step_x); x_off += window_step_x)
+        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
         {
             if(pooling_type != PoolingType::MAX)
             {