COMPMID-661: softmax-uint8 implementation (#16)

Change-Id: Iad11ce70a8a0878a48e445a092035c49c926cece
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94855
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
index f625672..8e2bb66 100644
--- a/arm_compute/core/CL/CLKernelLibrary.h
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -59,6 +59,17 @@
      * @param[in] option_false Option to add if condition is false
      */
     void add_option_if_else(bool cond, std::string option_true, std::string option_false);
+    /** Appends the given build options to the current object's options.
+     *
+     * @param[in] options Build options to append
+     */
+    void add_options(const StringSet &options);
+    /** Appends the given build options to the current object's options if a given condition is true.
+     *
+     * @param[in] cond    Condition to check
+     * @param[in] options Build options to append if condition is true
+     */
+    void add_options_if(bool cond, const StringSet &options);
     /** Gets the current options list set
      *
      * @return Build options set
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
index 675c462..ab550aa 100644
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -38,7 +38,7 @@
 public:
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
      * @param[out] output Destination tensor. Data types supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
@@ -60,11 +60,11 @@
     CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
      * @param[in]  max    Max values tensor. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     * @param[out] sum    Sum of 1D logits tensor. Data types supported: same as @p input
-     * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
+     * @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
+     * @param[out] sum    Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
+     * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.0
      */
     void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
 
@@ -146,11 +146,12 @@
     CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/S32/F16/F32
      * @param[in]  sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: same as @p input
+     * @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
+     * @param[in]  beta   (Optional) A scaling factor for the exponent. (Default = 1.0)
      */
-    void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output);
+    void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta = 1.0f);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index d2cd76e..e7a90fa 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -37,6 +37,13 @@
  * @param[out] right_shift      Right bit shift.
  */
 arm_compute::Error calculate_quantized_multiplier_less_than_one(double multiplier, int *quant_multiplier, int *right_shift);
+/** Calculate quantized representation of multiplier having value greater than one.
+ *
+ * @param[in]  multiplier           Real multiplier.
+ * @param[out] quantized_multiplier Integer multiplier.
+ * @param[out] left_shift           Left bit shift.
+ */
+arm_compute::Error calculate_quantized_multiplier_greater_than_one(double multiplier, int *quantized_multiplier, int *left_shift);
 } // namespace quantization
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 72ef679..ab1b1ab 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -53,7 +53,7 @@
     CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
      * @param[out] output Destination tensor. Data types supported: same as @p input
      * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
      */
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6ebdf29..94cc02a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -58,6 +58,19 @@
     (cond) ? add_option(std::move(option_true)) : add_option(std::move(option_false));
 }
 
+void CLBuildOptions::add_options(const StringSet &options)
+{
+    _build_opts.insert(options.begin(), options.end()); // Range-insert every entry of options into the stored build options
+}
+
+void CLBuildOptions::add_options_if(bool cond, const StringSet &options)
+{
+    if(cond) // Leave the stored options untouched when the condition is false
+    {
+        add_options(options); // Reuse add_options() for the actual insertion
+    }
+}
+
 const CLBuildOptions::StringSet &CLBuildOptions::options() const
 {
     return _build_opts;
@@ -299,8 +312,11 @@
     { "sobel_separable7x1", "sobel_filter.cl" },
     { "sobel_separable1x7", "sobel_filter.cl" },
     { "softmax_layer_max", "softmax_layer.cl" },
+    { "softmax_layer_max_quantized", "softmax_layer_quantized.cl" },
     { "softmax_layer_shift_exp_sum", "softmax_layer.cl" },
+    { "softmax_layer_shift_exp_sum_quantized", "softmax_layer_quantized.cl" },
     { "softmax_layer_norm", "softmax_layer.cl" },
+    { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" },
     { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
     { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
     { "suppress_non_maximum", "canny.cl" },
@@ -587,6 +603,10 @@
 #include "./cl_kernels/softmax_layer.clembed"
     },
     {
+        "softmax_layer_quantized.cl",
+#include "./cl_kernels/softmax_layer_quantized.clembed"
+    },
+    {
         "tablelookup.cl",
 #include "./cl_kernels/tablelookup.clembed"
     },
diff --git a/src/core/CL/cl_kernels/asymm_helper.h b/src/core/CL/cl_kernels/asymm_helper.h
new file mode 100644
index 0000000..10169a9
--- /dev/null
+++ b/src/core/CL/cl_kernels/asymm_helper.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ASYMM_HELPER_H
+#define ARM_COMPUTE_ASYMM_HELPER_H
+
+// TODO These functions were implemented for the softmax-uint8 kernel and therefore only process vectors of length 16.
+// They could be made more reusable by handling arbitrary vector lengths through the VEC_DATA_TYPE(int, size) definition.
+
+// The algorithms for these functions were taken from
+// https://github.com/google/gemmlowp/blob/master/fixedpoint/fixedpoint.h
+// and adapted to operate on integer vectors.
+
+/** Builds a per-element mask: every bit of a result element is set when the
+ * corresponding input element equals zero, and cleared otherwise.
+ *
+ * @param[in] a Input vector whose elements are compared against zero.
+ *
+ * @returns Vector holding all-ones for each zero element of @p a and all-zeros for every other element.
+ */
+inline int16 asymm_mask_if_zero(int16 a)
+{
+    const int16 all_zeros = 0;
+    const int16 all_ones  = ~0;
+    return select(all_zeros, all_ones, a == 0); // select() picks all_ones for lanes where the comparison holds
+}
+
+/** Builds a per-element mask: every bit of a result element is set when the
+ * corresponding input element is non-zero, and cleared otherwise.
+ *
+ * @param[in] a Input vector whose elements are compared against zero.
+ *
+ * @returns Vector holding all-ones for each non-zero element of @p a and all-zeros for every zero element.
+ */
+inline int16 asymm_mask_if_non_zero(int16 a)
+{
+    const int16 all_zeros = 0;
+    const int16 all_ones  = ~0;
+    return select(all_zeros, all_ones, a != 0); // select() picks all_ones for lanes where the comparison holds
+}
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] if_mask  Mask that decides, bit by bit, whether the result bit is taken from @p then_val (mask bit set) or from @p else_val (mask bit clear).
+ * @param[in] then_val Value whose bit will be used for result when corresponding bit in @p if_mask is set.
+ * @param[in] else_val Value whose bit will be used for result when corresponding bit in @p if_mask is not set.
+ *
+ * @returns Result containing bits from @p then_val where the corresponding bit in @p if_mask is set, and bits from @p else_val elsewhere.
+ */
+inline int16 asymm_select_using_mask(int16 if_mask, int16 then_val, int16 else_val)
+{
+    return (if_mask & then_val) ^ (~if_mask & else_val); // The two terms never share a set bit, so XOR simply merges them
+}
+
+/** Division by a power of two, correctly rounded to the nearest integer.
+ * Also known as a rounding arithmetic right shift.
+ *
+ * @param[in] x        Value to be divided by a power of two.
+ * @param[in] exponent Power of two, must be positive number.
+ *
+ * @return @p x shifted right by @p exponent bits, rounded to nearest.
+ */
+inline int16 asymm_rounding_divide_by_pow2(int16 x, int exponent)
+{
+    int16       mask      = (1 << exponent) - 1; // Selects the low bits that the shift discards
+    const int16 zero      = 0;
+    const int16 one       = 1;
+    int16       threshold = (mask >> 1) + select(zero, one, x < 0); // Threshold is one higher for negative lanes
+    return (x >> exponent) + select(zero, one, (x & mask) > threshold); // Add one where the discarded bits exceed the threshold
+}
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a non-positive exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest; an exponent of zero returns @p x unchanged).
+ *
+ * @param[in] x        Value needed to be multiplied or divided by power of two depending on sign of @p exponent.
+ * @param[in] exponent Power of two, can be positive, zero or negative.
+ *
+ * @return @p x shifted left by @p exponent with saturation, or shifted right by -@p exponent with rounding.
+ */
+inline int16 asymm_saturating_rounding_mult_by_pow2(int16 x, int exponent)
+{
+    if(exponent <= 0) // Zero goes here too: dividing by 2^0 returns x and keeps (1 << 31) - 1 below from overflowing
+    {
+        return asymm_rounding_divide_by_pow2(x, -exponent);
+    }
+
+    const int16 min           = INT_MIN;
+    const int16 max           = INT_MAX;
+    int         threshold     = ((1 << (31 - exponent)) - 1); // Largest magnitude that still fits after the left shift
+    int16       positive_mask = asymm_mask_if_non_zero(x > threshold);
+    int16       negative_mask = asymm_mask_if_non_zero(x < -threshold);
+    int16       result        = x << exponent;
+    result                    = asymm_select_using_mask(positive_mask, max, result); // Saturate lanes that would overflow upwards
+    result                    = asymm_select_using_mask(negative_mask, min, result); // Saturate lanes that would overflow downwards
+    return result;
+}
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] a First term of half-sum.
+ * @param[in] b Second term of half-sum.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+inline int16 asymm_rounding_half_sum(int16 a, int16 b)
+{
+    long16       a64       = convert_long16(a); // Widen to 64 bits so that a + b cannot overflow
+    long16       b64       = convert_long16(b);
+    long16       sum       = a64 + b64;
+    const long16 one       = 1;
+    const long16 minus_one = -1;
+    long16       sign      = select(minus_one, one, sum >= 0); // +1 for non-negative sums, -1 for negative sums
+    return convert_int16((sum + sign) / 2); // Nudging by the sign before the division rounds halves away from zero
+}
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ * This is equivalent to the VQRDMULH instruction in ARM NEON.
+ *
+ * @param[in] a First term of product.
+ * @param[in] b Second term of product.
+ *
+ * @return Product of two numbers.
+ */
+inline int16 asymm_saturating_rounding_doubling_high_mul(int16 a, int16 b)
+{
+    int16  overflow     = (a == b) && (a == INT_MIN); // Lanes where both operands are INT_MIN, i.e. -1 * -1
+    long16 a_64         = convert_long16(a);
+    long16 b_64         = convert_long16(b);
+    long16 ab_64        = a_64 * b_64; // Full product computed in 64 bits
+    long16 mask1        = 1 << 30;
+    long16 mask2        = 1 - (1 << 30);
+    long16 nudge        = select(mask2, mask1, ab_64 >= 0); // Rounding term: mask1 for non-negative products, mask2 otherwise
+    long16 mask         = 1ll << 31;
+    int16  ab_x2_high32 = convert_int16((ab_64 + nudge) / mask); // Dividing by 2^31 yields the high 32 bits of the doubled product
+    return select(ab_x2_high32, INT_MAX, overflow); // Overflowing lanes saturate to INT_MAX
+}
+
+/** Fixed-point multiplication.
+ *
+ * @param[in] a Argument 1 in fixed-point format Q(a).
+ * @param[in] b Argument 2 in fixed-point format Q(b).
+ *
+ * @return Result in fixed-point format Q(a+b).
+ */
+inline int16 asymm_mult(int16 a, int16 b)
+{
+    return asymm_saturating_rounding_doubling_high_mul(a, b); // Thin wrapper: the raw values are multiplied as Q0 numbers
+}
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0) using a 4th-order polynomial around -1/8.
+ *
+ * @param[in] a Argument in fixed-point format Q0.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+inline int16 asymm_exp_on_interval_between_negative_one_quarter_and_0_excl(int16 a)
+{
+    const int16 constant_term                            = 1895147668; // exp(-1/8) in Q0 (1895147668 / 2^31 ~= 0.8825)
+    const int16 constant_1_over_3                        = 715827883; // 1/3 in Q0
+    const int   k_fractional_bits                        = 31;
+    int16       x                                        = a + (1 << (k_fractional_bits - 3)); // x = a + 1/8
+    int16       x2                                       = asymm_mult(x, x);
+    int16       x3                                       = asymm_mult(x2, x);
+    int16       x4                                       = asymm_mult(x2, x2);
+    int16       x4_over_4                                = asymm_rounding_divide_by_pow2(x4, 2);
+    int16       x4_over_24_plus_x3_over_6_plus_x2        = asymm_mult((x4_over_4 + x3), constant_1_over_3) + x2;
+    int16       x4_over_24_plus_x3_over_6_plus_x2_over_2 = asymm_rounding_divide_by_pow2(x4_over_24_plus_x3_over_6_plus_x2, 1);
+    return constant_term + asymm_mult(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2); // exp(-1/8) * (1 + x + x^2/2 + x^3/6 + x^4/24)
+}
+
+/** Calculates \f$ exp(x) \f$ for x < 0 (an input of exactly 0 yields INT_MAX, the largest Q0 value).
+ *
+ * @param[in] a              Argument in fixed-point format Q(k_integer_bits).
+ * @param[in] k_integer_bits Number of integer bits in argument.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+inline int16 asymm_exp_on_negative_values(int16 a, int k_integer_bits)
+{
+    const int k_fractional_bits                      = 31 - k_integer_bits;
+    int16     k_one_quarter                          = 1 << (k_fractional_bits - 2); // 1/4 in Q(k_integer_bits)
+    int16     mask                                   = k_one_quarter - 1;
+    int16     a_mod_quarter_minus_one_quarter        = (a & mask) - k_one_quarter; // Part of a that lies in [-1/4, 0)
+    int16     a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; // Same value re-expressed in Q0
+    int16     result                                 = asymm_exp_on_interval_between_negative_one_quarter_and_0_excl(a_mod_quarter_minus_one_quarter_scaled);
+    int16     remainder                              = a_mod_quarter_minus_one_quarter - a; // Multiple of 1/4 still to be applied by the steps below
+
+#define EXP_BARREL_SHIFTER(Exponent, FixedPointMultiplier)                                       \
+    if(k_integer_bits > Exponent)                                                                \
+    {                                                                                            \
+        const int k_shift_amount = k_integer_bits > Exponent ? k_fractional_bits + Exponent : 0; \
+        result                   = asymm_select_using_mask(                                      \
+                                                                                                 asymm_mask_if_non_zero(remainder & (1 << k_shift_amount)),                           \
+                                                                                                 asymm_mult(result, FixedPointMultiplier), result);                                   \
+    }
+    EXP_BARREL_SHIFTER(-2, 1672461947); // Multiplier is exp(-1/4) in Q0
+    EXP_BARREL_SHIFTER(-1, 1302514674); // Multiplier is exp(-1/2) in Q0
+    EXP_BARREL_SHIFTER(+0, 790015084); // Multiplier is exp(-1) in Q0
+    EXP_BARREL_SHIFTER(+1, 290630308); // Multiplier is exp(-2) in Q0
+    EXP_BARREL_SHIFTER(+2, 39332535); // Multiplier is exp(-4) in Q0
+    EXP_BARREL_SHIFTER(+3, 720401); // Multiplier is exp(-8) in Q0
+    EXP_BARREL_SHIFTER(+4, 242); // Multiplier is exp(-16) in Q0
+#undef EXP_BARREL_SHIFTER
+
+    if(k_integer_bits > 5)
+    {
+        const int16 clamp = -(1 << (k_fractional_bits + 5)); // -32 in Q(k_integer_bits)
+        result            = asymm_select_using_mask(asymm_mask_if_non_zero(a < clamp), 0, result); // Inputs below -32 produce 0
+    }
+
+    const int16 Q0_one = INT_MAX;
+    return asymm_select_using_mask(asymm_mask_if_zero(a), Q0_one, result); // Lanes with a == 0 return the largest Q0 value
+}
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1) by iteratively refining an estimate of 2 / (1 + x).
+ *
+ * @param[in] a Argument in fixed-point format Q0.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+inline int16 asymm_one_over_one_plus_x_for_x_in_0_1(int16 a)
+{
+    const int16 Q0_one            = INT_MAX;
+    const int16 Q2_one            = 1 << (31 - 2);
+    int16       half_denominator  = asymm_rounding_half_sum(a, Q0_one); // (1 + a) / 2 in Q0
+    const int16 Q2_48_over_17     = 1515870810; // 48/17 in Q2
+    const int16 Q2_neg_32_over_17 = -1010580540; // -32/17 in Q2
+    int16       x                 = Q2_48_over_17 + asymm_mult(half_denominator, Q2_neg_32_over_17); // Initial estimate of 1 / half_denominator in Q2
+    for(int i = 0; i < 3; i++) // Three refinement steps: x += x * (1 - half_denominator * x)
+    {
+        int16 half_denominator_times_x           = asymm_mult(half_denominator, x);
+        int16 one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;
+        int16 tmp                                = asymm_mult(x, one_minus_half_denominator_times_x);
+        x                                        = x + asymm_saturating_rounding_mult_by_pow2(tmp, 2); // Q2 * Q2 product is Q4; shift left by 2 to return to Q2
+    }
+    return asymm_saturating_rounding_mult_by_pow2(x, 1); // Halving x and rescaling Q2 to Q0 nets a left shift by one
+}
+
+/** Treating the integer value as fixed-point, changes the number of integer bits and updates the value accordingly.
+ *
+ * @param[in] value            Value to be rescaled.
+ * @param[in] src_integer_bits Old number of integer bits.
+ * @param[in] dst_integer_bits New number of integer bits.
+ *
+ * @return Rescaled value.
+ */
+inline int16 asymm_rescale(int16 value, int src_integer_bits, int dst_integer_bits)
+{
+    int exponent = src_integer_bits - dst_integer_bits; // Positive when integer bits are removed, negative when they are added
+    return asymm_saturating_rounding_mult_by_pow2(value, exponent);
+}
+
+#endif // ARM_COMPUTE_ASYMM_HELPER_H
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
new file mode 100644
index 0000000..19cd983
--- /dev/null
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "asymm_helper.h"
+#include "helpers.h"
+
+#define MAX_OP(x, y, type, size) max((x), (y))
+#define ADD_OP(x, y, type, size) ((x) + (y))
+
+__constant uchar16 type_min = 0;
+__constant uint16 idx16     = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             Input image width
+ */
+__kernel void softmax_layer_max_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint width)
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+
+    // Initialize the per-lane running maximum to 0, the smallest uchar value
+    uchar16 max_val = 0;
+
+    // Calculate max of row, 16 elements at a time (width4 is the number of complete 16-element groups)
+    const uint width4 = width >> 4;
+    for(uint i = 0; i < width4; i++)
+    {
+        uchar16 data = vload16(0, (__global uchar *)offset(&src, i << 4, 0));
+        max_val      = MAX_OP(data, max_val, uchar, 16);
+    }
+
+#ifdef NON_MULTIPLE_OF_16
+    // Handle the trailing partial group: lanes at or beyond width are replaced by type_min
+    uchar16 data = vload16(0, (__global uchar *)offset(&src, width4 << 4, 0));
+    uchar16 widx = convert_uchar16(((uint16)(width4 << 4) + idx16) < width);
+    max_val      = MAX_OP(max_val, select(type_min, data, widx), uchar, 16);
+#endif /* NON_MULTIPLE_OF_16 */
+
+    // Perform max reduction across the 16 lanes by repeated halving
+    max_val.s01234567 = MAX_OP(max_val.s01234567, max_val.s89ABCDEF, uchar, 8);
+    max_val.s0123     = MAX_OP(max_val.s0123, max_val.s4567, uchar, 4);
+    max_val.s01       = MAX_OP(max_val.s01, max_val.s23, uchar, 2);
+    max_val.s0        = MAX_OP(max_val.s0, max_val.s1, uchar, 1);
+
+    // Store the row maximum
+    *((__global uchar *)dst.ptr) = max_val.s0;
+}
+
+#if defined(DIFF_MIN)
+
+int16 mult_by_quantized_multiplier(int16 data)
+{
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1) // Multipliers of 1 or less leave the data unscaled
+    {
+        return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER); // Left shift done as a multiplication, then fixed-point multiply
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+    return data; // No quantized beta applied: pass the data through unchanged
+}
+
+/** Shifts the values of the input tensor by the max calculated in softmax_layer_max_quantized kernel,
+ * then computes the exponential of each shifted element and sums these exponentials across each row.
+ *
+ * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in]  width                             Input image width
+ */
+__kernel void softmax_layer_shift_exp_sum_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(max),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    // Load max value of 1D logits vector (row)
+    int max_val = convert_int(*((__global uchar *)offset(&max, 0, 0)));
+
+    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
+    int16 sum1D = 0;
+
+    // Shift values, exp and sum, 16 elements at a time (width4 is the number of complete 16-element groups)
+    const uint width4 = width >> 4;
+    for(uint i = 0; i < width4; i++)
+    {
+        uchar16 data         = vload16(0, (__global uchar *)offset(&src, i << 4, 0));
+        int16 data_fp        = convert_int16(data);
+        int16 data_diff      = data_fp - max_val;
+        int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
+        data_fp              = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+        data_fp              = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+        vstore16(data_diff, 0, (__global int *)offset(&dst, i << 4, 0)); // dst receives the shifted differences, not the exponentials
+        sum1D = sum1D + select(0, data_fp, data_diff >= (int16)(DIFF_MIN)); // Only lanes with data_diff >= DIFF_MIN contribute
+    }
+
+#ifdef NON_MULTIPLE_OF_16
+    // Handle the trailing partial group of fewer than 16 elements
+    uchar16 data         = vload16(0, (__global uchar *)offset(&src, width4 << 4, 0));
+    int16 data_fp        = convert_int16(data);
+    int16 data_diff      = data_fp - max_val;
+    int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
+    data_fp              = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+    data_fp              = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+    int16 widx           = convert_int16(((uint16)(width4 << 4) + idx16) < width);
+    vstore16(data_diff, 0, (__global int *)offset(&dst, width4 << 4, 0));
+    data_fp = select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
+    sum1D   = sum1D + select(0, data_fp, widx); // Lanes at or beyond width add nothing to the sum
+#endif /* NON_MULTIPLE_OF_16 */
+
+    // Perform sum reduction across the 16 lanes by repeated halving
+    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, qs16, 8);
+    sum1D.s0123     = ADD_OP(sum1D.s0123, sum1D.s4567, qs16, 4);
+    sum1D.s01       = ADD_OP(sum1D.s01, sum1D.s23, qs16, 2);
+    sum1D.s0        = ADD_OP(sum1D.s0, sum1D.s1, qs16, 1);
+
+    // Store the row sum
+    *((__global int *)sum.ptr) = sum1D.s0;
+}
+
+/** Divides all the values of the input tensor by the sum calculated from the softmax_layer_shift_exp_sum_quantized kernel.
+ *
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(dst))
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+    // Load sum value of 1D logits vector (row)
+    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
+
+    // It would be better to calculate this in the previous stage and pass it here as a parameter
+    uint  sum_val_u               = convert_uint(sum_val);
+    int   headroom_plus_one       = clz(sum_val_u); // Number of leading zero bits of the sum
+    int   num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
+    int   shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31)); // Move the leading one to bit 31, then subtract 2^31
+    int16 shifted_sum_minus_one   = shifted_sum_minus_one_1; // Broadcast the scalar to all 16 lanes
+    int16 shifted_scale           = asymm_one_over_one_plus_x_for_x_in_0_1(shifted_sum_minus_one);
+
+    // The exponential was already calculated in the previous stage; it should be stored into the tmp output and reused
+    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
+    int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
+    int16 data           = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+
+    data = asymm_mult(shifted_scale, data);
+    data = asymm_rounding_divide_by_pow2(data, num_bits_over_unit + 31 - 8);
+    data = select(0, max(min(data, 255), 0), data_diff >= (int16)(DIFF_MIN)); // Clamp to [0, 255]; lanes with a difference below DIFF_MIN become 0
+    vstore16(convert_uchar16(data), 0, (__global uchar *)offset(&dst, 0, 0));
+}
+
+#endif /* defined(DIFF_MIN) */
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 6b42e18..af4fd88 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -33,15 +33,55 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 
 #include <set>
 #include <string>
 
 using namespace arm_compute;
+namespace
+{
+/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and returns them as build options.
+ *
+ * Prepares these build options:
+ * -SCALED_DIFF_INT_BITS, EXP_ACCUMULATION_INT_BITS - integer bits of the temporary fixed-point representations.
+ * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier.
+ * -DIFF_MIN - threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
+ *
+ * @param[in] input_scale Input scaling factor
+ * @param[in] beta        Exponent scaling factor beta
+ * @return Build options holding the definitions listed above
+ */
+CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
+{
+    // Number of integer bits in temporary fixed-point representation of current-to-max difference
+    static const int scaled_diff_int_bits = 5;
+    // Number of integer bits used in temporary fixed-point representation of exponent accumulator
+    static const int exp_accumulation_int_bits = 12;
+
+    const double beta_multiplier = std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1ll << 31) - 1.0);
+    // Zero-initialised on purpose: the helper returns early without writing its outputs when beta_multiplier < 1
+    // and its status is not checked here, so the values read below must never be indeterminate.
+    int input_beta_multiplier = 0, input_beta_left_shift = 0;
+    quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
+
+    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1ll << (31 - scaled_diff_int_bits)) / (1ll << input_beta_left_shift);
+    const int    diff_min           = -static_cast<int>(std::floor(max_input_rescaled));
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
+    build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_int_bits));
+    build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
+    build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
+    build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));
+
+    return build_opts;
+}
+} // namespace
 
 void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
     // Softmax across the x dimension
@@ -49,7 +89,12 @@
     output_shape.set(0, 1);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(),
+                       output_shape,
+                       1,
+                       input->info()->data_type(),
+                       input->info()->fixed_point_position(),
+                       input->info()->quantization_info());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
@@ -58,29 +103,22 @@
     _input  = input;
     _output = output;
 
+    const DataType data_type = input->info()->data_type();
     // The kernel loops over all elements in steps of 16
     const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    }
-    else if(input->info()->data_type() == DataType::F16)
-    {
-        build_opts.emplace("-DUSE_F16");
-    }
-
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option_if(is_data_type_fixed_point(data_type),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option_if(data_type == DataType::F16, "-DUSE_F16");
     // Tell the kernel that the width is not a multiple of 16
-    if((input->info()->dimension(0) % max_cl_vector_width) != 0)
-    {
-        build_opts.emplace("-DNON_MULTIPLE_OF_16");
-    }
+    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, "-DNON_MULTIPLE_OF_16");
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
+    std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "softmax_layer_max_quantized" : "softmax_layer_max";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set fixed arguments
     unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
@@ -107,17 +145,28 @@
 
 void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
 
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
+    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->info()->data_type();
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, tmp_data_type, input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, tmp_data_type, input->info()->fixed_point_position());
+
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+    if(is_quantized_asymmetric)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(output, sum);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
+    }
 
     _input  = input;
     _max    = max;
@@ -140,9 +189,12 @@
     build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, std::string("-DNON_MULTIPLE_OF_16"));
     build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), std::string("-DBETA=" + support::cpp11::to_string(beta_int)));
     build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), std::string("-DBETA=" + float_to_string_with_full_precision(beta)));
+    build_opts.add_options_if(is_quantized_asymmetric,
+                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts.options()));
+    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_shift_exp_sum_quantized" : "softmax_layer_shift_exp_sum";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set fixed arguments
     unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
@@ -201,7 +253,6 @@
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
-    ARM_COMPUTE_ERROR_ON(beta != 1.0f && input->info()->data_type() != DataType::F32);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
@@ -321,32 +372,52 @@
 {
 }
 
-void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
+void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum);
+
+    // Note: output should always have a scale of 1/256 and offset 0
+    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
+    const bool             is_quantized_asymmetric   = (input->info()->data_type() == DataType::S32);
+    const DataType         output_data_type          = is_quantized_asymmetric ? DataType::QASYMM8 : input->info()->data_type();
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(),
+                       input->info()->tensor_shape(),
+                       1,
+                       output_data_type,
+                       input->info()->fixed_point_position(),
+                       allowed_quantization_info);
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    if(!is_quantized_asymmetric)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(output->info()->quantization_info() != allowed_quantization_info);
+    }
 
     _input  = input;
     _sum    = sum;
     _output = output;
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    }
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_options_if(is_quantized_asymmetric,
+                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
+    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_norm_quantized" : "softmax_layer_norm";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure window
     constexpr unsigned int num_elems_processed_per_iteration = 16;
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 4ba5f44..848ee56 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -29,6 +29,8 @@
 
 using namespace arm_compute::quantization;
 
+constexpr int64_t fixed_point_one_Q0 = (1ll << 31);
+
 arm_compute::Error arm_compute::quantization::calculate_quantized_multiplier_less_than_one(double multiplier,
                                                                                            int   *quant_multiplier,
                                                                                            int   *right_shift)
@@ -45,16 +47,38 @@
     }
     const double q = std::frexp(multiplier, right_shift);
     *right_shift *= -1;
-    auto q_fixed = static_cast<int64_t>(round(q * (1ll << 31)));
-    ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > (1ll << 31));
-    if(q_fixed == (1ll << 31))
+    auto q_fixed = static_cast<int64_t>(round(q * fixed_point_one_Q0));
+    ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
+    if(q_fixed == fixed_point_one_Q0)
     {
         q_fixed /= 2;
         --*right_shift;
     }
     ARM_COMPUTE_RETURN_ERROR_ON(*right_shift < 0);
     ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > std::numeric_limits<int32_t>::max());
-    *quant_multiplier = static_cast<int>(q_fixed);
+    *quant_multiplier = static_cast<int32_t>(q_fixed);
 
     return arm_compute::Error{};
-}
\ No newline at end of file
+}
+
+arm_compute::Error arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(double multiplier,
+                                                                                              int   *quantized_multiplier,
+                                                                                              int   *left_shift)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier < 1.f); // Nothing has been written to the outputs up to this point
+    const double q       = std::frexp(multiplier, left_shift); // multiplier == q * 2^(*left_shift), with q in [0.5, 1)
+    auto         q_fixed = static_cast<int64_t>(round(q * fixed_point_one_Q0)); // q scaled by 2^31 and rounded to an integer
+    ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
+    if(q_fixed == fixed_point_one_Q0)
+    {
+        // Rounding reached 2^31, which does not fit in int32_t: halve the value and raise the exponent to compensate
+        q_fixed /= 2;
+        ++*left_shift;
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(*left_shift < 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > std::numeric_limits<int32_t>::max());
+    *quantized_multiplier = static_cast<int32_t>(q_fixed);
+
+    return arm_compute::Error{};
+}
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index a059f9e..ff018d5 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -41,16 +41,20 @@
 
 void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
 
     // Create intermediate tensors shapes
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+    DataType   tmp_data_type = is_data_type_quantized_asymmetric(input->info()->data_type()) ? DataType::S32 : input->info()->data_type();
+    TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), tmp_data_type, input->info()->fixed_point_position());
+    tensor_info_tmp.set_quantization_info(input->info()->quantization_info());
+    _tmp.allocator()->init(tensor_info_tmp);
 
-    TensorShape shape = input->info()->tensor_shape();
-    shape.set(0, 1);
-    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
-    _max.allocator()->init(tensor_info_max_sum);
-    _sum.allocator()->init(tensor_info_max_sum);
+    TensorShape max_sum_shape = input->info()->tensor_shape();
+    max_sum_shape.set(0, 1);
+    TensorInfo tensor_info_max(max_sum_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+    tensor_info_max.set_quantization_info(input->info()->quantization_info());
+    _max.allocator()->init(tensor_info_max);
+    _sum.allocator()->init(TensorInfo(max_sum_shape, input->info()->num_channels(), tmp_data_type, input->info()->fixed_point_position()));
 
     // Set GPU target to kernels
     _max_shift_exp_sum_kernel.set_target(CLScheduler::get().target());
@@ -72,7 +76,7 @@
     {
         _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
     }
-    _norm_kernel.configure(&_tmp, &_sum, output);
+    _norm_kernel.configure(&_tmp, &_sum, output, beta);
 
     // Allocate intermediate buffers
     _tmp.allocator()->allocate();
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 7842c5c..a06aa7b 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -50,9 +50,13 @@
 /** Tolerance for fixed point operations */
 constexpr AbsoluteTolerance<int16_t> tolerance_fixed_point(2);
 
+/** Tolerance for quantized operations */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
+
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
 {
+    DataType::QASYMM8,
     DataType::F16,
     DataType::F32,
     DataType::QS8,
@@ -65,12 +69,13 @@
 
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datasets::SoftmaxLayerSmallShapes(), datasets::SoftmaxLayerLargeShapes()), CNNDataTypes), shape, data_type)
 {
-    // Set fixed point position data type allowed
-    const int fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
+    // Set fixed point position and quantization info where the data type allows them
+    const int              fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
+    const QuantizationInfo quantization_info    = is_data_type_quantized_asymmetric(data_type) ? QuantizationInfo(1.f / 255.f, 0) : QuantizationInfo();
 
     // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, data_type, 1, fixed_point_position);
-    CLTensor dst = create_tensor<CLTensor>(shape, data_type, 1, fixed_point_position);
+    CLTensor src = create_tensor<CLTensor>(shape, data_type, 1, fixed_point_position, quantization_info);
+    CLTensor dst = create_tensor<CLTensor>(shape, data_type, 1, fixed_point_position, QuantizationInfo(1.f / 256.f, 0));
 
     ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -88,8 +93,17 @@
     CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(shape.x());
 
     // Validate src padding
-    const PaddingSize padding_src = PaddingCalculator(shape.x(), std::get<1>(reduction_info)).required_padding();
-    validate(src.info()->padding(), padding_src);
+    // Legacy path used only by quantized asymmetric data type. TODO(COMPMID-661): Remove once ported to the new path
+    if(is_data_type_quantized_asymmetric(data_type))
+    {
+        const PaddingSize padding_src = PaddingCalculator(shape.x(), 16).required_padding();
+        validate(src.info()->padding(), padding_src);
+    }
+    else
+    {
+        const PaddingSize padding_src = PaddingCalculator(shape.x(), std::get<1>(reduction_info)).required_padding();
+        validate(src.info()->padding(), padding_src);
+    }
 
     // Validate dst padding
     const PaddingSize padding_dst = PaddingCalculator(shape.x(), 16).required_padding();
@@ -101,7 +115,7 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -114,7 +128,7 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -130,12 +144,12 @@
 template <typename T>
 using CLSoftmaxLayerFixedPointFixture = SoftmaxValidationFixedPointFixture<CLTensor, CLAccessor, CLSoftmaxLayer, T>;
 
-TEST_SUITE(Quantized)
+TEST_SUITE(FixedPoint)
 TEST_SUITE(QS8)
 // Testing for fixed point position [1,6) as reciprocal limits the maximum fixed point position to 5
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::QS8)),
-                                                                                                                     framework::dataset::make("FractionalBits", 1, 6)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixedPointFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType",
+                                                                                                                       DataType::QS8)),
+                                                                                                               framework::dataset::make("FractionalBits", 1, 6)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fixed_point);
@@ -151,10 +165,10 @@
 
 TEST_SUITE(QS16)
 // Testing for fixed point position [1,14) as reciprocal limits the maximum fixed point position to 14
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::QS16)),
-                                                                                                                      framework::dataset::make("FractionalBits", 1, 14)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixedPointFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::QS16)),
+                                                                                                                framework::dataset::make("FractionalBits", 1, 14)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fixed_point);
@@ -170,6 +184,30 @@
 TEST_SUITE_END()
 TEST_SUITE_END()
 
+template <typename T>
+using CLSoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<CLTensor, CLAccessor, CLSoftmaxLayer, T>;
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+                                                                                                                       framework::dataset::make("DataType",
+                                                                                                                               DataType::QASYMM8)),
+                                                                                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) })))
+{
+    // Validate the target output against the reference within tolerance_qasymm8
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::QASYMM8)),
+                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) })))
+{
+    // Validate the target output against the reference within tolerance_qasymm8
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
diff --git a/tests/validation/CPP/SoftmaxLayer.cpp b/tests/validation/CPP/SoftmaxLayer.cpp
index eb76550..8e8cc1b 100644
--- a/tests/validation/CPP/SoftmaxLayer.cpp
+++ b/tests/validation/CPP/SoftmaxLayer.cpp
@@ -112,6 +112,18 @@
     return dst;
 }
 
+template <>
+SimpleTensor<uint8_t> softmax_layer<uint8_t>(const SimpleTensor<uint8_t> &src)
+{
+    // Note: the result is always converted back with the fixed output quantization info below (scale = 1/256, offset = 0)
+    const QuantizationInfo output_quantization_info = QuantizationInfo(1.f / 256, 0);
+
+    SimpleTensor<float>   src_tmp = convert_from_asymmetric(src); // float tensor obtained from the uint8_t input
+    SimpleTensor<float>   dst_tmp = softmax_layer<float>(src_tmp); // delegate to the float specialization
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric(dst_tmp, output_quantization_info); // back to uint8_t using the fixed output info
+    return dst;
+}
+
 template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src);
 template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src);
 template SimpleTensor<qint8_t> softmax_layer(const SimpleTensor<qint8_t> &src);
diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h
index 9c8f044..9836502 100644
--- a/tests/validation/fixtures/SoftmaxLayerFixture.h
+++ b/tests/validation/fixtures/SoftmaxLayerFixture.h
@@ -43,27 +43,33 @@
 namespace validation
 {
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class SoftmaxValidationFixedPointFixture : public framework::Fixture
+class SoftmaxValidationGenericFixture : public framework::Fixture // base of SoftmaxValidationFixture, SoftmaxValidationFixedPointFixture and SoftmaxValidationQuantizedFixture
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, DataType data_type, int fractional_bits)
+    void setup(TensorShape shape, DataType data_type, int fractional_bits, QuantizationInfo quantization_info)
     {
-        _fractional_bits = fractional_bits;
+        _fractional_bits   = fractional_bits;
+        _quantization_info = quantization_info; // kept as a member next to _fractional_bits
 
-        _target    = compute_target(shape, data_type, fractional_bits);
-        _reference = compute_reference(shape, data_type, fractional_bits);
+        _target    = compute_target(shape, data_type, fractional_bits, quantization_info); // dst tensor returned by compute_target()
+        _reference = compute_reference(shape, data_type, fractional_bits, quantization_info); // output of reference::softmax_layer<T>
     }
 
 protected:
     template <typename U>
     void fill(U &&tensor)
     {
-        if(_fractional_bits == 0)
+        if(!is_data_type_quantized(tensor.data_type())) // select the distribution from the tensor's data type instead of _fractional_bits
         {
             std::uniform_real_distribution<> distribution(-1000.f, 1000.f);
             library->fill(tensor, distribution, 0);
         }
+        else if(is_data_type_quantized_asymmetric(tensor.data_type())) // asymmetric quantized types get their own integer range
+        {
+            std::uniform_int_distribution<> distribution(0, 100); // integers in [0, 100] handed to library->fill
+            library->fill(tensor, distribution, 0);
+        }
         else
         {
             const int                       one_fixed = 1 << _fractional_bits;
@@ -72,11 +78,11 @@
         }
     }
 
-    TensorType compute_target(const TensorShape &shape, DataType data_type, int fixed_point_position = 0)
+    TensorType compute_target(const TensorShape &shape, DataType data_type, int fixed_point_position, QuantizationInfo quantization_info)
     {
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position);
-        TensorType dst = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position);
+        TensorType src = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position, quantization_info); // input carries the caller-supplied quantization info
+        TensorType dst = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position, QuantizationInfo(1.f / 256, 0)); // output is created with scale 1/256 and offset 0
 
         // Create and configure function
         FunctionType smx_layer;
@@ -101,10 +107,10 @@
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, int fixed_point_position = 0)
+    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, int fixed_point_position, QuantizationInfo quantization_info)
     {
         // Create reference
-        SimpleTensor<T> src{ shape, data_type, 1, fixed_point_position };
+        SimpleTensor<T> src{ shape, data_type, 1, fixed_point_position, quantization_info }; // reference input carries the caller-supplied quantization info
 
         // Fill reference
         fill(src);
@@ -112,19 +118,42 @@
         return reference::softmax_layer<T>(src);
     }
 
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-    int             _fractional_bits{};
+    TensorType       _target{}; // assigned from compute_target() in setup()
+    SimpleTensor<T>  _reference{}; // assigned from compute_reference() in setup()
+    int              _fractional_bits{}; // assigned in setup(); read by fill()
+    QuantizationInfo _quantization_info{}; // assigned in setup()
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class SoftmaxValidationFixture : public SoftmaxValidationFixedPointFixture<TensorType, AccessorType, FunctionType, T>
+class SoftmaxValidationFixture : public SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
     template <typename...>
     void setup(TensorShape shape, DataType data_type)
     {
-        SoftmaxValidationFixedPointFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, 0);
+        SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, 0, QuantizationInfo()); // 0 fractional bits and a default-constructed QuantizationInfo
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class SoftmaxValidationFixedPointFixture : public SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, DataType data_type, int fixed_point_position)
+    {
+        SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, fixed_point_position, QuantizationInfo()); // forwards the fixed point position with a default-constructed QuantizationInfo
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class SoftmaxValidationQuantizedFixture : public SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info)
+    {
+        SoftmaxValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, 0, quantization_info); // forwards the quantization info with fractional bits fixed to 0
     }
 };
 } // namespace validation