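COMPMID-3794: Fix window loops causing performance regression

The vectorized x-loops in the output stage kernels were bounded by
`x < (window_end_x - window_step_x)`. That condition stops one step
early: when exactly `window_step_x` elements remain, the last full
vector is not processed by the vectorized loop and is presumably left
to the slower left-over path. Use `<=` in all four loops so that every
full step is handled by the vectorized loop.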

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Id4d95c6ce5fed91bb079b8bfe1abceedefd20c97
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4117
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index c022fa0..8c11574 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -110,7 +110,7 @@
     execute_window_loop(win, [&](const Coordinates & id)
     {
         int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Get bias and pointer to input
             const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
@@ -175,7 +175,7 @@
     execute_window_loop(win, [&](const Coordinates &)
     {
         int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Get bias and pointer to input
             const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
@@ -238,7 +238,7 @@
     {
 
         int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Get bias and pointer to input
             const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
@@ -323,7 +323,7 @@
     execute_window_loop(win, [&](const Coordinates &)
     {
         int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Get bias and pointer to input
             const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;