COMPMID-661: softmax-fp32 optimisation (#14)

Change-Id: I2007af1ed9dcf68065cf412aa50f73a2025b31a6
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94605
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 3219952..6efeebd 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -300,6 +300,8 @@
     { "softmax_layer_max", "softmax_layer.cl" },
     { "softmax_layer_shift_exp_sum", "softmax_layer.cl" },
     { "softmax_layer_norm", "softmax_layer.cl" },
+    { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
+    { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
     { "suppress_non_maximum", "canny.cl" },
     { "tablelookup_U8", "tablelookup.cl" },
     { "tablelookup_S16", "tablelookup.cl" },
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
index 5476a6e..b329118 100644
--- a/src/core/CL/cl_kernels/fixed_point.h
+++ b/src/core/CL/cl_kernels/fixed_point.h
@@ -359,7 +359,12 @@
         return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), clz(sum) > dec_m); /* Saturate result if needed */ \
     }
 
+EXPQ_IMPL(qs8, qs8x2, 2) /* 2-, 4- and 8-lane qs8 instantiations; presumably required by vector sizes below 16 in softmax_layer.cl - confirm */
+EXPQ_IMPL(qs8, qs8x4, 4)
+EXPQ_IMPL(qs8, qs8x8, 8)
 EXPQ_IMPL(qs8, qs8x16, 16)
+EXPQ_IMPL(qs16, qs16x2, 2) /* 2- and 4-lane qs16 instantiations, added alongside the existing 8- and 16-lane ones */
+EXPQ_IMPL(qs16, qs16x4, 4)
 EXPQ_IMPL(qs16, qs16x8, 8)
 EXPQ_IMPL(qs16, qs16x16, 16)
 
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 330d67d..768f7ee 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -45,6 +45,9 @@
 #define VEC_DATA_TYPE_STR(type, size) type##size
 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
 
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size /* Pastes type and size into one token, e.g. uint + 4 -> uint4 */
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size) /* Extra level so that macro arguments are expanded before pasting */
+
 #define CONVERT_STR(x, type) (convert_##type((x)))
 #define CONVERT(x, type) CONVERT_STR(x, type)
 
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 010135e..5bc43ef 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -57,8 +57,36 @@
 
 #endif /* FIXED_POINT_POSITION */
 
+/* Number of workitems in dimension 0; sizes the local scratch array of the parallel kernel. Defaults to 1 unless passed with -DGRID_SIZE. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+/* Vector size, i.e. number of vector elements. Only the default (16) branch defines LOG_VECTOR_SIZE; for 2, 4 and 8 it has to come from -DLOG_VECTOR_SIZE. */
+#if VECTOR_SIZE == 2
+__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
+__constant uint2 idx__ = (uint2)(0, 1);
+
+#elif VECTOR_SIZE == 4
+__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
+__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
+
+#elif VECTOR_SIZE == 8
+__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
+__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+
+#else /* VECTOR_SIZE DEFAULT */
+#define VECTOR_SIZE 16
+#define LOG_VECTOR_SIZE 4
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
+__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+#endif /* VECTOR_SIZE END */
+
+// TODO (COMPMID-661): Remove if the non-fused kernels are removed
 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
 __constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+__constant uint4 idx4   = (uint4)(0, 1, 2, 3); /* Lane indices used by the 4-wide parallel kernel to mask lanes beyond the row width */
 
 /** Identifies the maximum value across the 1st dimension.
  *
@@ -277,3 +305,462 @@
     data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
     vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
 }
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then gets the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] maxo_ptr                           Pointer to the max values tensor slice (receives the row maximum). Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_serial(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+#ifdef BETA
+    // Initialize beta from the -DBETA build option (the same macro that guards this block)
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
+#endif /* BETA */
+
+    // Initialize local maximum
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
+
+    // Calculate max of row: width_ is the number of full vectors in the row
+    const uint width_ = width >> LOG_VECTOR_SIZE;
+    for(uint i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+        data_max    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    // Tail vector: lanes whose index is >= width are replaced by the type minimum so they cannot win the max
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
+    widx        = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform max reduction
+#if VECTOR_SIZE == 16
+    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+    // Store result
+    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+
+    /* Second section */
+
+    // Load max value of 1D logits vector (row)
+    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
+
+    // Set sum vector
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    sum1D = 0;
+
+    // Shift values, exp and sum
+    for(uint i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+        data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    // Tail vector: lanes beyond the row width are zeroed so they do not contribute to the sum
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
+    data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef BETA
+    data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+#endif /* BETA */
+    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    data = select(0, data, widx);
+    VSTORE(VECTOR_SIZE)
+    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+    sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform sum reduction
+#if VECTOR_SIZE == 16
+    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
+
+    // Calculate and store result
+    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+}
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then gets the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice (this kernel keeps the maximum in local memory and does not store it). Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_parallel(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    const uint lid = get_local_id(0);
+
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
+#endif /* BETA */
+
+    // Define one temporary vector per work-item.
+    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
+    __local DATA_TYPE max_local;
+
+    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
+    // Number of elements per work-item.
+    const uint row = width / GRID_SIZE;
+    // Number of iterations per work-item.
+    const uint width_ = row >> 2;
+    // Calculate max of row
+    uint i = 0;
+    for(; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    // How many work-items needed to complete the computation.
+    //TODO: Optimize this calculation (avoid %).
+    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of 4: use the 4-wide type_min4 so the select operands match regardless of VECTOR_SIZE
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
+        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
+        widx        = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        max_val_vec = MAX_OP(max_val_vec, select(type_min4, data_max, widx), DATA_TYPE, 4);
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = max_val_vec;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        max_val_vec     = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
+        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+        max_local       = max_val_vec.s0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Second section */
+
+    // Set sum vector
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    sum1D             = 0;
+    DATA_TYPE max_val = max_local;
+
+    // Shift values, exp and sum
+    for(i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    //TODO: Optimize the calculation (avoid %).
+    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
+        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        data = select(0, data, widx);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = sum1D;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
+        // Perform sum reduction
+        sum1D.s01                        = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+        sum1D.s0                         = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
+        *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+    }
+}
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 1b89161..6b42e18 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -185,6 +185,137 @@
     while(window_collapsed.slide_window_slice_3D(slice));
 }
 
+/** Grid size (obtained through auto-tuning) */
+const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
+/** Vector size in the serial case (obtained through auto-tuning) */
+const unsigned int CLLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
+/** Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost). */
+const unsigned int CLLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
+
+CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
+    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr) // Tensor pointers stay null until configure() assigns them
+{
+}
+
+void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+    ARM_COMPUTE_ERROR_ON(beta != 1.0f && input->info()->data_type() != DataType::F32);
+
+    // Output auto initialization if not yet initialized (sum takes the shape of max, output the shape of input)
+    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
+
+    const DataType dt                 = input->info()->data_type();
+    const size_t   reduction_dim_size = input->info()->dimension(0);
+    auto           beta_int           = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
+    build_opts.add_option_if(is_data_type_fixed_point(dt),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
+    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
+    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+
+    std::string           kernel_name             = std::string("softmax_layer_max_shift_exp_sum_serial");
+    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
+    unsigned int          vector_size             = std::get<1>(parallel_reduction_info);
+    // A single workgroup performs the reduction in dimension 0 in the parallel case, hence lws[0] must equal gws[0] (_grid_size).
+    // The serial case has no such requirement, so the local work size is left to the driver.
+    _lws_hint = std::get<0>(parallel_reduction_info) ? cl::NDRange(_grid_size) : cl::NullRange;
+
+    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+    build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
+    build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
+
+    // Configure parallel kernel if needed
+    if(std::get<0>(parallel_reduction_info))
+    {
+        kernel_name            = std::string("softmax_layer_max_shift_exp_sum_parallel");
+        bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
+        build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
+
+        // Handle boundary conditions.
+        const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
+        build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
+    }
+
+    // Create kernel.
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set static arguments. Both the kernels use the same arguments
+    unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
+    _kernel.setArg<cl_uint>(idx++, reduction_dim_size);
+
+    // Configure window
+    const unsigned int num_elems_x = ceil_to_multiple(input->info()->tensor_shape().x(), vector_size);
+    Window             win         = calculate_max_window(*input->info(), Steps(num_elems_x));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_x);
+    AccessWindowHorizontal max_access(max->info(), 0, 1);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_x);
+    AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+    update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
+{
+    const bool         use_parallel = (_grid_size > 1) && (size >= (_grid_size * _serial_vector_size)); // Rows shorter than one serial vector per work-item stay serial
+    const unsigned int chosen_width = use_parallel ? _parallel_vector_size : _serial_vector_size;       // Vector size that goes with the selected kernel
+    return std::make_tuple(use_parallel, chosen_width);
+}
+
+void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Fold the Z dimension into the window whenever that is possible
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+    // The parallel reduction needs its own range along X
+    const ParallelReductionInfo reduction_info = is_parallel_reduction(_input->info()->dimension(0));
+    if(std::get<0>(reduction_info))
+    {
+        // Cover _grid_size steps along X, each as wide as the parallel vector size.
+        const unsigned int vec_step = std::get<1>(reduction_info);
+        collapsed.set(Window::DimX, Window::Dimension(0, _grid_size * vec_step, vec_step));
+    }
+
+    // Enqueue the kernel once per 3D slice
+    Window slice = collapsed.first_slice_window_3D();
+    do
+    {
+        unsigned int arg_idx = 0;
+        // Bind the tensors in kernel-argument order: src, maxo, dst, sum
+        add_3D_tensor_argument(arg_idx, _input, slice);
+        add_3D_tensor_argument(arg_idx, _max, slice);
+        add_3D_tensor_argument(arg_idx, _output, slice);
+        add_3D_tensor_argument(arg_idx, _sum, slice);
+        enqueue(queue, *this, slice, _lws_hint);
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+
 CLLogits1DNormKernel::CLLogits1DNormKernel()
     : _input(nullptr), _sum(nullptr), _output(nullptr)
 {
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index fa324ee..7268d8e 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -23,15 +23,19 @@
  */
 #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
 
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
 
 CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp(), _run_legacy_path(false) // legacy path off by default
 {
 }
 
@@ -48,14 +52,26 @@
     _max.allocator()->init(tensor_info_max_sum);
     _sum.allocator()->init(tensor_info_max_sum);
 
+    // Set GPU target to kernels
+    _max_shift_exp_sum_kernel.set_target(CLScheduler::get().target());
+
     // Manage intermediate buffers
     _memory_group.manage(&_tmp);
     _memory_group.manage(&_max);
     _memory_group.manage(&_sum);
 
-    // Configure Kernels
-    _max_kernel.configure(input, &_max);
-    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
+    // Configure kernels
+    // TODO (COMPMID-661): Remove legacy path once the new one is properly validated
+    _run_legacy_path = is_data_type_quantized_assymetric(input->info()->data_type());
+    if(_run_legacy_path)
+    {
+        _max_kernel.configure(input, &_max);
+        _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
+    }
+    else
+    {
+        _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
+    }
     _norm_kernel.configure(&_tmp, &_sum, output);
 
     // Allocate intermediate buffers
@@ -68,8 +84,16 @@
 {
     _memory_group.acquire();
 
-    CLScheduler::get().enqueue(_max_kernel, false);
-    CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
+    // Run the legacy two-kernel path if it was selected, otherwise the fused max/shift/exp/sum kernel
+    if(_run_legacy_path)
+    {
+        CLScheduler::get().enqueue(_max_kernel, false);
+        CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
+    }
     CLScheduler::get().enqueue(_norm_kernel);
 
     _memory_group.release();