COMPMID-1401 Implement NEFullyConnectedLayer for QASYMM8

Change-Id: I0404df6d369855e2f458f2db8f26e81c80a1ee87
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140148
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index ee334df..af84d02 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -193,11 +193,14 @@
         Window win_vector_sum_row(collapsed_window);
         win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
         win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+        win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
         Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col);
         Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
         Iterator mm_result(_mm_result, window);
 
+        const size_t sum_row_stride_y = _vector_sum_row->info()->strides_in_bytes().y();
+
         execute_window_loop(collapsed_window, [&](const Coordinates & id)
         {
             // Compute the leftover term due to a_offset.
@@ -217,7 +220,7 @@
             a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset);
 
             // Compute the leftover term due to b_offset.
-            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr()) + id.y());
+            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr() + id.z() * sum_row_stride_y) + id.y());
             b_offset_term_s32           = vmulq_n_s32(b_offset_term_s32, _b_offset);
 
             // Add a_offset_term_s32 and b_offset_term_s32
@@ -266,14 +269,17 @@
         Window win_vector_sum_row(collapsed_window);
         win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
         win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+        win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
         Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
         Iterator mm_result(_mm_result, window);
 
+        const size_t sum_row_stride_y = _vector_sum_row->info()->strides_in_bytes().y();
+
         execute_window_loop(window, [&](const Coordinates & id)
         {
             // Compute the leftover term due to b_offset.
-            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr()) + id.y());
+            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr() + id.z() * sum_row_stride_y) + id.y());
             b_offset_term_s32           = vmulq_n_s32(b_offset_term_s32, _b_offset);
 
             int32x4x4_t in_s32 =