COMPMID-722 - Support for vector-matrix in GEMMLowp (NEON)

This patch includes COMPMID-716 as well

- Added vector-matrix case in NEGEMMLowpMatrixMultiplyKernel
- Added benchmarks for NEON and OpenCL

Change-Id: I715cd25e8668a4d6c8127e9a298a865e7713267f
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111468
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 208a60c..a68a01f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -42,81 +42,439 @@
 
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 namespace
 {
-Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        if(id.x() > width_b)
+        {
+            return;
+        }
 
-    TensorShape in0_shape = input0->tensor_shape();
-    TensorShape in1_shape = input1->tensor_shape();
-    TensorShape out_shape = output->tensor_shape();
+        // Note: Since the inputs are all non-negative, we can use uint32_t
+        // Accumulators for the block 0
+        uint32x4x4_t c0 =
+        {
+            {
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0)
+            }
+        };
 
-    in0_shape.collapse(2);
-    in1_shape.collapse(2);
-    out_shape.collapse(2);
+        auto vec_a          = reinterpret_cast<const uint8_t *>(ina.ptr());
+        auto matrix_b       = reinterpret_cast<const uint8_t *>(inb.ptr());
+        auto vec_a_end_addr = vec_a + width_a;
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+        // This for loop performs 8 accumulations
+        for(; vec_a <= (vec_a_end_addr - 8);)
+        {
+            const uint8x8_t  a00_u8 = vld1_u8(vec_a);
+            const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
+            const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
+            const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
+            const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
+            const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
+            const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
+            const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
+            const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
 
-    return Error{};
+            // Convert a00_u8 to uint16_t and split it into its lower and upper halves
+            const uint16x4x2_t a00_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(a00_u8)),
+                    vget_high_u16(vmovl_u8(a00_u8))
+                }
+            };
+
+            const uint16x4x4_t b00_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
+                }
+            };
+
+            const uint16x4x4_t b10_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
+                }
+            };
+
+            const uint16x4x4_t b20_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
+                }
+            };
+
+            const uint16x4x4_t b30_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
+                }
+            };
+
+            const uint16x4x4_t b40_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
+                }
+            };
+
+            const uint16x4x4_t b50_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
+                }
+            };
+
+            const uint16x4x4_t b60_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
+                }
+            };
+
+            const uint16x4x4_t b70_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
+                }
+            };
+
+            // Accumulate 0:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
+
+            // Accumulate 1:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
+
+            // Accumulate 2:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
+
+            // Accumulate 3:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
+
+            // Accumulate 4:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
+
+            // Accumulate 5:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
+
+            // Accumulate 6:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
+
+            // Accumulate 7:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
+
+            vec_a += 8;
+            matrix_b += 8 * stride_b;
+        }
+
+        // This for loop performs the left-over accumulations
+        for(; vec_a < vec_a_end_addr;)
+        {
+            const uint8x8_t  a00_u8 = vld1_dup_u8(vec_a);
+            const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
+
+            const uint16x4x4_t b00_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
+                }
+            };
+
+            // Convert a00_u8 to uint16_t and get the lower part
+            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+            // Accumulate 0:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+            vec_a += 1;
+            matrix_b += stride_b;
+        }
+
+        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+        vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
+        vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
+        vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
+        vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
+    },
+    ina, inb, out);
 }
 
-std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window)
 {
-    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        if(id.x() > width_b)
+        {
+            return;
+        }
 
-    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        // Accumulators for the block 0
+        int32x4x4_t c0 =
+        {
+            {
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0)
+            }
+        };
 
-    AccessWindowStatic     in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1));
-    AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
-    AccessWindowRectangle  output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+        auto vec_a          = reinterpret_cast<const int8_t *>(ina.ptr());
+        auto matrix_b       = reinterpret_cast<const int8_t *>(inb.ptr());
+        auto vec_a_end_addr = vec_a + width_a;
 
-    bool window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+        // This for loop performs 8 accumulations
+        for(; vec_a <= (vec_a_end_addr - 8);)
+        {
+            const int8x8_t  a00_s8 = vld1_s8(vec_a);
+            const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
+            const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
+            const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
+            const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
+            const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
+            const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
+            const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
+            const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+            // Convert a00_s8 to int16_t and split it into its lower and upper halves
+            const int16x4x2_t a00_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(a00_s8)),
+                    vget_high_s16(vmovl_s8(a00_s8))
+                }
+            };
 
-    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
-    return std::make_pair(err, win);
-}
-} // namespace
+            const int16x4x4_t b00_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
+                }
+            };
 
-NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
-{
-}
+            const int16x4x4_t b10_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
+                }
+            };
 
-void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
+            const int16x4x4_t b20_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
+                }
+            };
 
-    TensorShape in1_shape = input1->info()->tensor_shape();
-    in1_shape.collapse(2);
+            const int16x4x4_t b30_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
+                }
+            };
 
-    _input0         = input0;
-    _input1         = input1;
-    _output         = output;
-    _slide_matrix_b = in1_shape[2] != 1;
+            const int16x4x4_t b40_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
+                }
+            };
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
+            const int16x4x4_t b50_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
+                }
+            };
 
-Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
+            const int16x4x4_t b60_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
+                }
+            };
 
-    return Error{};
+            const int16x4x4_t b70_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
+                }
+            };
+
+            // Accumulate 0:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
+
+            // Accumulate 1:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
+
+            // Accumulate 2:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
+
+            // Accumulate 3:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
+
+            // Accumulate 4:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
+
+            // Accumulate 5:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
+
+            // Accumulate 6:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
+
+            // Accumulate 7:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
+
+            vec_a += 8;
+            matrix_b += 8 * stride_b;
+        }
+
+        // This for loop performs the left-over accumulations
+        for(; vec_a < vec_a_end_addr;)
+        {
+            const int8x8_t  a00_s8 = vld1_dup_s8(vec_a);
+            const int8x16_t b00_s8 = vld1q_s8(matrix_b);
+
+            const int16x4x4_t b00_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
+                }
+            };
+
+            // Convert a00_s8 to int16_t and get the lower part
+            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+            // Accumulate 0:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+            vec_a += 1;
+            matrix_b += stride_b;
+        }
+
+        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+        vst1q_s32(vec_out + 0, c0.val[0]);
+        vst1q_s32(vec_out + 4, c0.val[1]);
+        vst1q_s32(vec_out + 8, c0.val[2]);
+        vst1q_s32(vec_out + 12, c0.val[3]);
+    },
+    ina, inb, out);
 }
 
 void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window)
@@ -176,7 +534,7 @@
             const uint8x8_t  a00_u8 = vld1_u8(mtx_a0);
             const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
 
-            // Convert a00_s8 to uint16_t and get the lower part
+            // Convert a00_u8 to uint16_t and get the lower part
             const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
 
             // Convert b00_s8 to uint16_t
@@ -355,6 +713,115 @@
     },
     ina, inb, out);
 }
+} // namespace
+
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    TensorShape in0_shape = input0->tensor_shape();
+    TensorShape in1_shape = input1->tensor_shape();
+    TensorShape out_shape = output->tensor_shape();
+
+    // Check vector-by-matrix case
+    if(out_shape[1] == 1)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
+    }
+    else
+    {
+        in0_shape.collapse(2);
+        in1_shape.collapse(2);
+        out_shape.collapse(2);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+    }
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+    Window win;
+    bool   window_changed = false;
+
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+    if((output->dimension(1) == 1))
+    {
+        // Configure kernel window
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+
+        // We cannot read out-of-bound elements from matrix A as we use the left-over for loop
+        AccessWindowStatic     in0_access(input0, 0, 0, input0->tensor_shape().x(), 1);
+        AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+
+        window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+    }
+    else
+    {
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowStatic     in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1));
+        AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
+        AccessWindowRectangle  output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+    }
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
+
+    TensorShape in1_shape = input1->info()->tensor_shape();
+    in1_shape.collapse(2);
+
+    _input0         = input0;
+    _input1         = input1;
+    _output         = output;
+    _slide_matrix_b = in1_shape[2] != 1;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
+
+    return Error{};
+}
 
 void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
 {
@@ -362,48 +829,106 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
-    const size_t out_stride  = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
-
-    // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
-    Window win_a(window);
-    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));
-
-    // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix
-    Window win_b;
-    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-    if(_slide_matrix_b)
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
+    if((_output->info()->dimension(1) == 1))
     {
-        win_b = window;
+        const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0));
+        const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0));
+        const auto in_b_stride    = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type()));
+
+        // The implementation computes 16 elements per iteration
+        const int window_start_x = 16 * info.thread_id;
+        const int window_step_x  = 16 * info.num_threads;
+        // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+        const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+        Window win_out(window);
+        win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+        win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        Window win_a(window);
+        win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+        Window win_b;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(_input1->info()->num_dimensions() >= 3)
+        {
+            win_b = window;
+        }
+        win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+        win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        Iterator ina(_input0, win_a);
+        Iterator inb(_input1, win_b);
+        Iterator out(_output, win_out);
+
+        switch(_input0->info()->data_type())
+        {
+            case DataType::S8:
+            {
+                vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window);
+                break;
+            }
+            case DataType::U8:
+            case DataType::QASYMM8:
+            {
+                vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                break;
+            }
+        }
     }
-    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
-    win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    // The step x and step y for the output matrix has been already set using in configure()
-    Iterator ina(_input0, win_a);
-    Iterator inb(_input1, win_b);
-    Iterator out(_output, window);
-
-    const int width_b = _input1->info()->dimension(0);
-    switch(_input0->info()->data_type())
+    else
     {
-        case DataType::S8:
+        const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
+        const size_t out_stride  = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
+
+        // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix
+        Window win_a(window);
+        win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));
+
+        // Set step_x and step_y for matrix B. Scale the X range by a factor of 16, as the transposed input matrix B has 16 times fewer columns than the output matrix
+        Window win_b;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(_slide_matrix_b)
         {
-            matrix_multiply_s8(ina, inb, out, width_b, out_stride, window);
-            break;
+            win_b = window;
         }
-        case DataType::U8:
-        case DataType::QASYMM8:
+        win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
+        win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+        // The step x and step y for the output matrix have already been set in configure()
+        Iterator ina(_input0, win_a);
+        Iterator inb(_input1, win_b);
+        Iterator out(_output, window);
+
+        const int width_b = _input1->info()->dimension(0);
+        switch(_input0->info()->data_type())
         {
-            matrix_multiply_u8(ina, inb, out, width_b, out_stride, window);
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
+            case DataType::S8:
+            {
+                matrix_multiply_s8(ina, inb, out, width_b, out_stride, window);
+                break;
+            }
+            case DataType::U8:
+            case DataType::QASYMM8:
+            {
+                matrix_multiply_u8(ina, inb, out, width_b, out_stride, window);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                break;
+            }
         }
     }
 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index da5ac22..2c6515c 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -48,7 +48,7 @@
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
-      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0)
+      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
 {
 }
 
@@ -57,10 +57,9 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info()));
 
-    bool dot_product_path = false;
-
-    _a_offset = a->info()->quantization_info().offset;
-    _b_offset = b->info()->quantization_info().offset;
+    _a_offset                         = a->info()->quantization_info().offset;
+    _b_offset                         = b->info()->quantization_info().offset;
+    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
 
 #ifdef ARM_COMPUTE_AARCH64_V8_2
     // Check for DOT product instruction
@@ -69,7 +68,7 @@
 
     if(cpu_has_dotprod != 0)
     {
-        dot_product_path = true;
+        _dot_product_path = true;
 
         // Configure matrix multiply kernel
         struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -90,42 +89,54 @@
     else
 #endif /* ARM_COMPUTE_AARCH64_V8_2 */
     {
-        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-        TensorShape shape_tmp_a = a->info()->tensor_shape();
-        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
-        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-        TensorShape shape_tmp_b = b->info()->tensor_shape();
-        shape_tmp_b.set(0, b->info()->dimension(1) * 16);
-        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
-        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
-        _tmp_a.allocator()->init(info_a);
-        _tmp_b.allocator()->init(info_b);
-        _memory_group.manage(&_tmp_a);
-        _memory_group.manage(&_tmp_b);
-
-        // Configure interleave kernel
+        if(_run_vector_matrix_multiplication)
         {
-            auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
-            k->configure(a, &_tmp_a);
-            _mtx_a_reshape_kernel = std::move(k);
+            // Configure matrix multiply kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+                k->configure(a, b, output);
+                _mm_kernel = std::move(k);
+            }
         }
-
-        // Configure transpose kernel
+        else
         {
-            auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
-            k->configure(b, &_tmp_b);
-            _mtx_b_reshape_kernel = std::move(k);
-        }
+            // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
+            TensorShape shape_tmp_a = a->info()->tensor_shape();
+            shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
 
-        // Configure matrix multiply kernel
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
-            k->configure(&_tmp_a, &_tmp_b, output);
-            _mm_kernel = std::move(k);
+            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+            TensorShape shape_tmp_b = b->info()->tensor_shape();
+            shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+            _tmp_a.allocator()->init(info_a);
+            _tmp_b.allocator()->init(info_b);
+            _memory_group.manage(&_tmp_a);
+            _memory_group.manage(&_tmp_b);
+
+            // Configure interleave kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+                k->configure(a, &_tmp_a);
+                _mtx_a_reshape_kernel = std::move(k);
+            }
+
+            // Configure transpose kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+                k->configure(b, &_tmp_b);
+                _mtx_b_reshape_kernel = std::move(k);
+            }
+
+            // Configure matrix multiply kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+                k->configure(&_tmp_a, &_tmp_b, output);
+                _mm_kernel = std::move(k);
+            }
         }
     }
 
@@ -166,7 +177,7 @@
     _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
 
     // Allocate tensors
-    if(!dot_product_path)
+    if(!_dot_product_path && !_run_vector_matrix_multiplication)
     {
         _tmp_a.allocator()->allocate();
         _tmp_b.allocator()->allocate();
@@ -199,8 +210,9 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
                                     "The output matrix must have the same number of columns as the matrix B");
 
-    int32_t a_offset = a->quantization_info().offset;
-    int32_t b_offset = b->quantization_info().offset;
+    int32_t a_offset                         = a->quantization_info().offset;
+    int32_t b_offset                         = b->quantization_info().offset;
+    bool    run_vector_matrix_multiplication = a->dimension(1) < 2;
 
 #ifdef ARM_COMPUTE_AARCH64_V8_2
     // Check for DOT product instruction
@@ -215,22 +227,29 @@
     else
 #endif /* ARM_COMPUTE_AARCH64_V8_2 */
     {
-        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-        TensorShape shape_tmp_a = a->tensor_shape();
-        shape_tmp_a.set(0, a->dimension(0) * 4);
-        shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+        if(!run_vector_matrix_multiplication)
+        {
+            // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
+            TensorShape shape_tmp_a = a->tensor_shape();
+            shape_tmp_a.set(0, a->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
 
-        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-        TensorShape shape_tmp_b = b->tensor_shape();
-        shape_tmp_b.set(0, b->dimension(1) * 16);
-        shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+            TensorShape shape_tmp_b = b->tensor_shape();
+            shape_tmp_b.set(0, b->dimension(1) * 16);
+            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
 
-        TensorInfo info_a(shape_tmp_a, 1, a->data_type());
-        TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+        }
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -271,14 +290,18 @@
 {
     _memory_group.acquire();
 
-    if(_mtx_a_reshape_kernel)
+    // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction
+    if(!_run_vector_matrix_multiplication && !_dot_product_path)
     {
-        NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
-    }
+        if(_mtx_a_reshape_kernel)
+        {
+            NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+        }
 
-    if(_mtx_b_reshape_kernel)
-    {
-        NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+        if(_mtx_b_reshape_kernel)
+        {
+            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+        }
     }
 
     NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);