Fix indexing of SVE ArithmeticAddition kernel

The kernel used the wrong index when source tensors of different
data types with different element widths were used.

The increment of the loop index inside the kernel and the offset
added to that index when loading multiple source vectors (and
storing the corresponding results) have been corrected.

Resolves: COMPMID-4303

Change-Id: Ib1ad431dc80c937d7f19bafe5cb57fc52b6f3735
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5304
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp
index 5bd2e12..ae74bfa 100644
--- a/src/core/cpu/kernels/add/sve/integer.cpp
+++ b/src/core/cpu/kernels/add/sve/integer.cpp
@@ -154,9 +154,9 @@
                 const auto vsrc1_0  = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
                 const auto vsrc1_1  = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
                 svst1_s16(pg_0, output_ptr + x, svadd_s16_z(pg_0, vsrc0_0, vsrc1_0));
-                svst1_s16(pg_1, output_ptr + x, svadd_s16_z(pg_1, vsrc0_1, vsrc1_1));
+                svst1_s16(pg_1, output_ptr + x + svcnth(), svadd_s16_z(pg_1, vsrc0_1, vsrc1_1));
 
-                x += svcnth();
+                x += svcntb();
                 pg_u = svwhilelt_b8(x, window_end_x);
                 pg_0 = svwhilelt_b16(x, window_end_x);
                 pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
@@ -172,15 +172,15 @@
             do
             {
                 const auto vsrc0_0  = svld1_s16(pg_0, input1_ptr + x);
-                const auto vsrc0_1  = svld1_s16(pg_1, input1_ptr + x);
+                const auto vsrc0_1  = svld1_s16(pg_1, input1_ptr + x + svcnth());
                 const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
                 const auto vsrc1_0  = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
                 const auto vsrc1_1  = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
 
                 svst1_s16(pg_0, output_ptr + x, svqadd(vsrc0_0, vsrc1_0));
-                svst1_s16(pg_1, output_ptr + x, svqadd(vsrc0_1, vsrc1_1));
+                svst1_s16(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_1, vsrc1_1));
 
-                x += svcnth();
+                x += svcntb();
                 pg_u = svwhilelt_b8(x, window_end_x);
                 pg_0 = svwhilelt_b16(x, window_end_x);
                 pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);