COMPMID-3237: Extend GEMMLowpReduction kernels to multiply reductions by a scalar value

Change-Id: If2a242f52aea753591525d30a4cb64c1a766bf8d
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2881
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
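
For reference, a minimal usage sketch of the new descriptor-based configure() interface follows. It is illustrative only: the tensor shapes, the scalar value of 2, the wrapping function name and the choice of split dimension are assumptions, not part of this patch.

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // Hypothetical example: reduce a 4x16 QASYMM8 matrix A into an S32 vector of
    // row sums, multiplying each row sum by a scalar value of 2.
    void row_reduction_example()
    {
        Tensor mtx_a{};
        Tensor vector_sum_row{};
        mtx_a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::QASYMM8));
        vector_sum_row.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
        mtx_a.allocator()->allocate();
        vector_sum_row.allocator()->allocate();
        // (In real use, mtx_a would be filled with data at this point.)

        // k = 16 matrix A columns, input not reshaped, multiply-by-scalar enabled.
        const GEMMLowpReductionKernelInfo reduction_info(16, false, 2, true);

        NEGEMMLowpMatrixAReductionKernel mtx_a_reduction_kernel;
        mtx_a_reduction_kernel.configure(&mtx_a, &vector_sum_row, reduction_info);

        NEScheduler::get().schedule(&mtx_a_reduction_kernel, Window::DimX);
        // Each entry of vector_sum_row now holds 2 * (the sum of the corresponding row of mtx_a).
    }
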
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index 58400b1..d9d3e1a 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -124,5 +124,27 @@
     float epsilon;             /**< Lower bound value for the normalization. Defaults to 1e-12 */
     bool  use_mixed_precision; /**< Use mixed precision in case of FP16 execution. Defaults to true */
 };
+
+struct GEMMLowpReductionKernelInfo
+{
+    /** Default constructor */
+    GEMMLowpReductionKernelInfo() = default;
+    /** Constructor
+     *
+     * @param[in] k             Number of matrix columns/rows.
+     * @param[in] is_reshaped   True if the input tensor has been reshaped.
+     * @param[in] scalar        Scalar value to multiply each reduced column/row by.
+     * @param[in] mul_by_scalar True if each column/row reduction has to be multiplied by a scalar value.
+     */
+    GEMMLowpReductionKernelInfo(int32_t k, bool is_reshaped, int32_t scalar, bool mul_by_scalar)
+        : k(k), is_reshaped(is_reshaped), scalar(scalar), mul_by_scalar(mul_by_scalar)
+    {
+    }
+
+    int32_t k{ 0 };                 /**< Number of matrix columns/rows */
+    bool    is_reshaped{ false };   /**< True if the input tensor has been reshaped */
+    int32_t scalar{ 0 };            /**< Scalar value to multiply each reduced column/row by */
+    bool    mul_by_scalar{ false }; /**< True if each column/row reduction has to be multiplied by a scalar value */
+};
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index fb781ae..1e472f5 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,7 +28,9 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
+struct GEMMLowpReductionKernelInfo;
 
 /** Common interface for all NEON reduction kernels */
 class INEGEMMLowpReductionKernel : public INEKernel
@@ -47,18 +49,23 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input       Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] output      Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
-     * @param[in]  k           Number of matrix A columns (or matrix B rows)
-     * @param[in]  is_reshaped True if the input tensor has been reshaped
+     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+     * @param[in]  info   Kernel metadata:
+     *                    - k             Number of matrix columns/rows depending on the type of reduction.
+     *                    - is_reshaped   True if the matrix has been reshaped.
+     *                    - scalar        Scalar value to multiply each reduced column/row by.
+     *                    - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
      */
-    virtual void configure(const ITensor *input, ITensor *output, int32_t k, bool is_reshaped) = 0;
+    virtual void configure(const ITensor *input, ITensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
 
 protected:
     const ITensor *_input;
     ITensor       *_output;
     int32_t        _k;
     bool           _is_reshaped;
+    int32_t        _scalar;
+    bool           _mul_by_scalar;
 };
 
 /** NEON kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
@@ -75,22 +82,28 @@
     }
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mtx_a             Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] vector_sum_row    Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in]  num_mtx_a_cols    Number of matrix A columns
-     * @param[in]  is_interleaved4x4 True if the matrix A has been interleaved4x4
+     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in]  info           Kernel metadata:
+     *                            - k             (num_mtx_a_cols) Number of matrix A columns
+     *                            - is_reshaped   (is_interleaved4x4) True if the matrix A has been interleaved4x4
+     *                            - scalar        Scalar value to multiply each reduced row by.
+     *                            - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
      */
-    void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override;
+    void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
      *
-     * @param[in] mtx_a             Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] vector_sum_row    Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in] num_mtx_a_cols    Number of matrix A columns
-     * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4
+     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in] info           Kernel metadata:
+     *                           - k             (num_mtx_a_cols) Number of matrix A columns
+     *                           - is_reshaped   (is_interleaved4x4) True if the matrix A has been interleaved4x4
+     *                           - scalar        Scalar value to multiply each reduced row by.
+     *                           - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4);
+    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -118,22 +131,28 @@
     }
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mtx_b            Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] vector_sum_col   Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in]  num_mtx_b_rows   Number of matrix B rows
-     * @param[in]  is_transposed1xW True if the input tensor is transposed 1xW
+     * @param[in]  mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in]  info           Kernel metadata:
+     *                            - k             (num_mtx_b_rows) Number of matrix B rows.
+     *                            - is_reshaped   (is_transposed1xW) True if the input tensor is transposed 1xW.
+     *                            - scalar        Scalar value to multiply each reduced column by.
+     *                            - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
      */
-    void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override;
+    void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
      *
-     * @param[in] mtx_b            Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] vector_sum_col   Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in] num_mtx_b_rows   Number of matrix B rows
-     * @param[in] is_transposed1xW True if the input tensor is transposed 1xW
+     * @param[in] mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in] info           Kernel metadata:
+     *                           - k             (num_mtx_b_rows) Number of matrix B rows.
+     *                           - is_reshaped   (is_transposed1xW) True if the input tensor is transposed 1xW.
+     *                           - scalar        Scalar value to multiply each reduced column by.
+     *                           - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW);
+    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index c87e806..8dc6b88 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -74,7 +74,7 @@
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
      * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
      *
-     * @param[in]  a         First input tensor  (Matrix A or Vector A). Data type supported: BLOAT16/F16/F32
+     * @param[in]  a         First input tensor  (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32
      * @param[in]  b         Second input tensor (Matrix B). Data type supported: same as @p a
      * @param[in]  c         Third input tensor  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
      * @param[out] d         Output tensor. Data type supported: same as @p a
@@ -86,7 +86,7 @@
     void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM.
      *
-     * @param[in]  a         First input tensor info  (Matrix or Vector A). Data types supported: BLOAT16/F16/F32
+     * @param[in]  a         First input tensor info  (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32
      * @param[in]  b         Second input tensor info (Matrix B). Data type supported: same as @p a.
      * @param[in]  c         Third input tensor info  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
      * @param[out] output    Output tensor info. Data type supported: same as @p a
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index 5368384..e7da100 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -66,9 +66,9 @@
      * @param[in]  weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
      *                     Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
      * @param[in]  biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                     Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BLOAT16
+     *                     Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16
      * @param[out] output  Destination tensor.
-     *                     Data types supported: Same as @p weights, FP32 if @p weights is BLOAT16
+     *                     Data types supported: Same as @p weights, FP32 if @p weights is BFLOAT16
      */
     void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
@@ -76,9 +76,9 @@
      * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
      *                    Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
      * @param[in] biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                    Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BLOAT16
+     *                    Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16
      * @param[in] output  Destination tensor.
-     *                    Data types supported: Same as @p weights FP32 if @p weights is BLOAT16
+     *                    Data types supported: Same as @p weights, FP32 if @p weights is BFLOAT16
      *
      * @return an error status
      */
@@ -140,7 +140,7 @@
 /** Basic function to compute the convolution layer. This function calls the following NEON kernels/functions:
  *
  * -# @ref NEIm2ColKernel
- * -# @ref NEGEMM (if the data type is BLOAT16/FP16/FP32)
+ * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32)
  * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
  * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED)
  * -# @ref NEArithmeticAdditionKernel (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 374005d..b7e862c 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
@@ -37,26 +38,29 @@
 #include <cstddef>
 #include <cstdint>
 
-using namespace arm_compute;
-
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 namespace
 {
 Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
 
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+    }
     return Status{};
 }
 std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
 {
     const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
 
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, TensorShape(input->dimension(1)), 1, DataType::S32);
+
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
     AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
@@ -72,9 +76,14 @@
 
 Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
 
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+    }
     return Status{};
 }
 
@@ -82,6 +91,9 @@
 {
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32);
+
     // Configure kernel window
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
@@ -98,20 +110,22 @@
 } // namespace
 
 INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
-    : _input(), _output(), _k(0), _is_reshaped(false)
+    : _input(), _output(), _k(0), _is_reshaped(false), _scalar(0), _mul_by_scalar(false)
 {
 }
 
-void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
+void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
 
-    _input       = mtx_a;
-    _output      = vector_sum_row;
-    _k           = num_mtx_a_cols;
-    _is_reshaped = is_interleaved4x4;
+    _input         = mtx_a;
+    _output        = vector_sum_row;
+    _k             = info.k;
+    _is_reshaped   = info.is_reshaped;
+    _scalar        = info.scalar;
+    _mul_by_scalar = info.mul_by_scalar;
 
     // Configure kernel window
     auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);
@@ -119,11 +133,10 @@
     INEKernel::configure(win_config.second);
 }
 
-Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
+Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
 {
-    ARM_COMPUTE_UNUSED(num_mtx_a_cols);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), info.is_reshaped).first);
 
     return Status{};
 }
@@ -145,11 +158,12 @@
     Iterator in(_input, win_input);
     Iterator out(_output, collapsed_window);
 
+    const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
+
     if(_is_reshaped)
     {
         execute_window_loop(collapsed_window, [&](const Coordinates & id)
         {
-            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
             auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
 
             const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
@@ -194,6 +208,12 @@
                 sum_row = wrapper::vaddw(sum_row, a0_d16);
             }
 
+            // Multiply by scalar if necessary
+            if(_mul_by_scalar)
+            {
+                sum_row = wrapper::vmul(sum_row, vec_scalar);
+            }
+
             auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
 
             wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row));
@@ -243,6 +263,12 @@
             sum_row += wrapper::vgetlane(tmp, 0);
 #endif // __aarch64__
 
+            // Multiply by scalar if necessary
+            if(_mul_by_scalar)
+            {
+                sum_row *= _scalar;
+            }
+
             *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
         },
         in, out);
@@ -269,15 +295,17 @@
     }
 }
 
-void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
+void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
 
-    _input       = mtx_b;
-    _output      = vector_sum_col;
-    _k           = num_mtx_b_rows;
-    _is_reshaped = is_transposed1xW;
+    _input         = mtx_b;
+    _output        = vector_sum_col;
+    _k             = info.k;
+    _is_reshaped   = info.is_reshaped;
+    _scalar        = info.scalar;
+    _mul_by_scalar = info.mul_by_scalar;
 
     // Configure kernel window
     auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
@@ -285,10 +313,9 @@
     INEKernel::configure(win_config.second);
 }
 
-Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
+Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
 {
-    ARM_COMPUTE_UNUSED(num_mtx_b_rows);
-    ARM_COMPUTE_UNUSED(is_transposed1xW);
+    ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
 
@@ -304,6 +331,8 @@
 
     Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
 
+    const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
+
     if(_is_reshaped)
     {
         Window win_input(collapsed_window);
@@ -350,6 +379,15 @@
                 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
             }
 
+            // Multiply by scalar if necessary
+            if(_mul_by_scalar)
+            {
+                sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
+                sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
+                sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
+                sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
+            }
+
             auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
 
             wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
@@ -465,6 +503,15 @@
                 matrix_b += in_b_stride;
             }
 
+            // Multiply by scalar if necessary
+            if(_mul_by_scalar)
+            {
+                sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
+                sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
+                sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
+                sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
+            }
+
             auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
 
             wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
@@ -495,3 +542,4 @@
             ARM_COMPUTE_ERROR("Unsupported data type");
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 8c6cee7..3417c72 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
@@ -37,7 +38,8 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
@@ -172,6 +174,9 @@
 
     if(!_fused_assembly_path)
     {
+        // Build reduction info
+        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false);
+
         // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
         if(_a_offset != 0)
         {
@@ -184,7 +189,7 @@
             }
 
             // Configure Matrix B reduction kernel
-            _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
+            _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info);
         }
 
         // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -196,7 +201,7 @@
             _memory_group.manage(&_vector_sum_row);
 
             // Configure matrix A reduction kernel
-            _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);
+            _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info);
         }
 
         if(_fuse_output_stage)
@@ -418,13 +423,15 @@
         TensorInfo info_vector_sum_col{};
         TensorInfo info_vector_sum_row{};
 
+        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
+
         // Validate matrix B reduction kernel only if _a_offset is not equal to 0
         if(a_offset != 0)
         {
             info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
 
             // Configure Matrix B reduction kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
         }
 
         // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -433,7 +440,7 @@
             info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
 
             // Configure matrix A reduction kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
         }
 
         if(fuse_output_stage)
@@ -580,3 +587,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute