Interpreting tensor as 1D for CPU multiplication

* Also fix a bug in mul_U8_U8_U8.

Resolves: COMPMID-5460
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: Ie1edafeae7aaad91164caeeb04661a8974a7fc1b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8244
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index da7b6d7..2f04bf9 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -531,11 +531,11 @@
             }
             if(is_sat)
             {
-                vst1q_u8(output_ptr, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
+                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
             }
             else
             {
-                vst1q_u8(output_ptr, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
+                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
             }
         }
 
@@ -1618,7 +1618,8 @@
     }
 
     // Configure kernel window
-    Window win = calculate_max_window(out_shape);
+    Window win;
+    std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);
 
     ICpuKernel::configure(win);
 }