Remove legacy PostOps code

PostOps was the experimental interface for Dynamic Fusion. It has now been
replaced by the new Dynamic Fusion interface, which generates code with the
Compute Kernel Writer.
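
For reference, a fused "main op + post op" chain is now expressed by
appending regular operators to a Dynamic Fusion workload sketch instead of
passing a PostOpList. The sketch below is illustrative only; the header
paths, operator names and create_op signatures are assumptions about the
experimental arm_compute::experimental::dynamic_fusion interface and are
not part of this patch:

    // Illustrative only: names and signatures below are assumed, not
    // taken from this change.
    #include "arm_compute/core/CL/CLCompileContext.h"
    #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    // Fuse conv2d with a trailing elementwise add, the case previously
    // covered by PostOpEltwiseAdd in a PostOpList<ITensorInfo *>.
    void fuse_conv_add(CLCompileContext &cl_ctx,
                       const TensorInfo &src_info, const TensorInfo &wei_info,
                       const TensorInfo &bia_info, const TensorInfo &addend_info)
    {
        GpuWorkloadContext context{ &cl_ctx }; // assumed constructor
        GpuWorkloadSketch  sketch{ &context };

        // Register tensor infos with the workload context (assumed helper).
        ITensorInfo *src    = context.create_tensor_info(src_info);
        ITensorInfo *wei    = context.create_tensor_info(wei_info);
        ITensorInfo *bia    = context.create_tensor_info(bia_info);
        ITensorInfo *addend = context.create_tensor_info(addend_info);

        // Main op, then the former "post op" appended to the same sketch;
        // kernel code for the fused sequence is generated by the Compute
        // Kernel Writer when the sketch is compiled.
        ITensorInfo *conv_dst = GpuConv2d::create_op(sketch, src, wei, bia, Conv2dAttributes{});
        ITensorInfo *add_dst  = GpuAdd::create_op(sketch, conv_dst, addend);
        GpuOutput::create_op(sketch, add_dst, context.create_tensor_info());
    }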

Resolves: COMPMID-6190

Change-Id: I813b48facef2fd6f3aee332588886b4f9b3d33d8
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10219
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 885786c..dabd70c 100644
--- a/Android.bp
+++ b/Android.bp
@@ -28,12 +28,6 @@
         "src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl",
         "src/core/CL/cl_kernels/common/elementwise_unary.cl",
         "src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl",
-        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
-        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
-        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
-        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
-        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h",
-        "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
         "src/core/CL/cl_kernels/common/fft.cl",
         "src/core/CL/cl_kernels/common/fft_digit_reverse.cl",
         "src/core/CL/cl_kernels/common/fft_scale.cl",
diff --git a/SConscript b/SConscript
index 4eb3c25..7bc5aff 100644
--- a/SConscript
+++ b/SConscript
@@ -222,7 +222,7 @@
                 found = pattern.search(line)
                 if found:
                     # Only get the header file name and discard the relative path.
-                    # E.g. "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h" -> "fp_mixed_precision_helpers.h"
+                    # E.g. "src/core/CL/cl_kernels/activation_float_helpers.h" -> "activation_float_helpers.h"
                     include_file = found.group(1).split('/')[-1]
                     data = files_dict[include_file].file_contents
                     updated_file.extend(data)
@@ -387,9 +387,6 @@
                         'src/core/CL/cl_kernels/tile_helpers.h',
                         'src/core/CL/cl_kernels/types.h',
                         'src/core/CL/cl_kernels/warp_helpers.h',
-                        'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h',
-                        'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h',
-                        'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h',
                     ]
 
     # Common kernels
@@ -414,9 +411,6 @@
                        'src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl',
                        'src/core/CL/cl_kernels/common/elementwise_unary.cl',
                        'src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl',
-                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl',
-                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl',
-                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl',
                        'src/core/CL/cl_kernels/common/fft_digit_reverse.cl',
                        'src/core/CL/cl_kernels/common/fft.cl',
                        'src/core/CL/cl_kernels/common/fft_scale.cl',
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index 305766e..2bf5dee 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -21,12 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS
-#define ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS
+#ifndef ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
+#define ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
 
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 
 namespace arm_compute
@@ -60,46 +59,43 @@
 {
     GEMMKernelInfo() = default;
     GEMMKernelInfo(
-        unsigned int                                   im,
-        unsigned int                                   in,
-        unsigned int                                   ik,
-        unsigned int                                   idepth_output_gemm3d,
-        bool                                           ireinterpret_input_as_3d,
-        bool                                           ibroadcast_bias,
-        bool                                           ifp_mixed_precision,
-        bool                                           ihas_pad_y,
-        ActivationLayerInfo                            iactivation_info,
-        int                                            inmult_transpose1xW_width,
-        int                                            imult_interleave4x4_height,
-        GEMMLHSMatrixInfo                              ilhs_info,
-        GEMMRHSMatrixInfo                              irhs_info,
-        int32_t                                        ina_offset,
-        int32_t                                        inb_offset,
-        const experimental::PostOpList<ITensorInfo *> &ipost_ops = experimental::PostOpList<ITensorInfo *> {})
+        unsigned int        im,
+        unsigned int        in,
+        unsigned int        ik,
+        unsigned int        idepth_output_gemm3d,
+        bool                ireinterpret_input_as_3d,
+        bool                ibroadcast_bias,
+        bool                ifp_mixed_precision,
+        bool                ihas_pad_y,
+        ActivationLayerInfo iactivation_info,
+        int                 inmult_transpose1xW_width,
+        int                 imult_interleave4x4_height,
+        GEMMLHSMatrixInfo   ilhs_info,
+        GEMMRHSMatrixInfo   irhs_info,
+        int32_t             ina_offset,
+        int32_t             inb_offset)
         : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision),
           has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info),
-          rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset), post_ops(ipost_ops)
+          rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset)
     {
     }
 
-    unsigned int                            m{ 0 };                           /**< Number of LHS rows*/
-    unsigned int                            n{ 0 };                           /**< Number of RHS columns*/
-    unsigned int                            k{ 0 };                           /**< Number of LHS columns or RHS rows */
-    unsigned int                            depth_output_gemm3d{ 0 };         /**< Depth of the output tensor in case is reinterpreted as 3D */
-    bool                                    reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
-    bool                                    broadcast_bias{ false };          /**< Flag used to broadcast the bias addition */
-    bool                                    fp_mixed_precision{ false };      /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
-    bool                                    has_pad_y{ false };               /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
-    ActivationLayerInfo                     activation_info{};                /**< Activation function to perform after the matrix multiplication */
-    int                                     mult_transpose1xW_width{ 1 };     /**< Multiplication factor for the width of the 1xW transposed block */
-    int                                     mult_interleave4x4_height{ 1 };   /**< Multiplication factor for the height of the 4x4 interleaved block */
-    GEMMLHSMatrixInfo                       lhs_info{};                       /**< LHS matrix information used to retrieve the number of rows processed by each thread */
-    GEMMRHSMatrixInfo                       rhs_info{};                       /**< RHS matrix information used for reshaping the RHS matrix */
-    int32_t                                 a_offset{ 0 };                    /**< Offset to be added to each element of the matrix A */
-    int32_t                                 b_offset{ 0 };                    /**< Offset to be added to each element of the matrix B */
-    GEMMLowpOutputStageInfo                 output_stage{};                   /**< GEMMLowp output stage information */
-    experimental::PostOpList<ITensorInfo *> post_ops{};                       /**< (EXPERIMENTAL_POST_OPS) Specifies a list of post ops to be fused after the main op. Note unsupported post ops would not be executed.
-                                                          *   If specified, automatically disable the @ref activation_info */
+    unsigned int            m{ 0 };                           /**< Number of LHS rows*/
+    unsigned int            n{ 0 };                           /**< Number of RHS columns*/
+    unsigned int            k{ 0 };                           /**< Number of LHS columns or RHS rows */
+    unsigned int            depth_output_gemm3d{ 0 };         /**< Depth of the output tensor in case is reinterpreted as 3D */
+    bool                    reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
+    bool                    broadcast_bias{ false };          /**< Flag used to broadcast the bias addition */
+    bool                    fp_mixed_precision{ false };      /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
+    bool                    has_pad_y{ false };               /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
+    ActivationLayerInfo     activation_info{};                /**< Activation function to perform after the matrix multiplication */
+    int                     mult_transpose1xW_width{ 1 };     /**< Multiplication factor for the width of the 1xW transposed block */
+    int                     mult_interleave4x4_height{ 1 };   /**< Multiplication factor for the height of the 4x4 interleaved block */
+    GEMMLHSMatrixInfo       lhs_info{};                       /**< LHS matrix information used to retrieve the number of rows processed by each thread */
+    GEMMRHSMatrixInfo       rhs_info{};                       /**< RHS matrix information used for reshaping the RHS matrix */
+    int32_t                 a_offset{ 0 };                    /**< Offset to be added to each element of the matrix A */
+    int32_t                 b_offset{ 0 };                    /**< Offset to be added to each element of the matrix B */
+    GEMMLowpOutputStageInfo output_stage{};                   /**< GEMMLowp output stage information */
 };
 
 /** Compute descriptor used by the depthwise convolution native kernel */
@@ -240,4 +236,4 @@
     bool export_rhs_to_cl_image{ false }; /**< Flag to know whether the RHS tensor should be exported to cl_image*/
 };
 } // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS */
+#endif // ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 12d8602..9264cef 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_ARM_COMPUTE_CORE_TYPES
-#define ACL_ARM_COMPUTE_CORE_TYPES
+#ifndef ACL_ARM_COMPUTE_CORE_TYPES_H
+#define ACL_ARM_COMPUTE_CORE_TYPES_H
 
 /** The following symbols have been moved to:
  * half
@@ -65,7 +65,6 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Size3D.h"
 #include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/core/utils/misc/Macros.h"
 #include "support/Bfloat16.h"
 
@@ -751,14 +750,14 @@
     }
 
 private:
-    std::vector<float> _min_sizes;
-    std::vector<float> _variances;
-    float              _offset;
-    bool               _flip;
-    bool               _clip;
-    std::vector<float> _max_sizes;
-    std::vector<float> _aspect_ratios;
-    Coordinates2D      _img_size;
+    std::vector<float>   _min_sizes;
+    std::vector<float>   _variances;
+    float                _offset;
+    bool                 _flip;
+    bool                 _clip;
+    std::vector<float>   _max_sizes;
+    std::vector<float>   _aspect_ratios;
+    Coordinates2D        _img_size;
     std::array<float, 2> _steps;
 };
 
@@ -1003,15 +1002,15 @@
     }
 
 private:
-    unsigned int _max_detections;
-    unsigned int _max_classes_per_detection;
-    float        _nms_score_threshold;
-    float        _iou_threshold;
-    unsigned int _num_classes;
+    unsigned int         _max_detections;
+    unsigned int         _max_classes_per_detection;
+    float                _nms_score_threshold;
+    float                _iou_threshold;
+    unsigned int         _num_classes;
     std::array<float, 4> _scales_values;
-    bool         _use_regular_nms;
-    unsigned int _detection_per_class;
-    bool         _dequantize_scores;
+    bool                 _use_regular_nms;
+    unsigned int         _detection_per_class;
+    bool                 _dequantize_scores;
 };
 
 /** Pooling Layer Information struct*/
@@ -1462,13 +1461,13 @@
     }
 
 private:
-    float _img_width;
-    float _img_height;
-    float _scale;
-    bool  _apply_scale;
-    bool  _correct_transform_coords;
+    float                _img_width;
+    float                _img_height;
+    float                _scale;
+    bool                 _apply_scale;
+    bool                 _correct_transform_coords;
     std::array<float, 4> _weights;
-    float _bbox_xform_clip;
+    float                _bbox_xform_clip;
 };
 
 /** Normalization Layer Information class */
@@ -1915,4 +1914,4 @@
 /** Class for holding information related to cropping */
 using CropInfo = Padding2D;
 } // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_CORE_TYPES */
+#endif // ACL_ARM_COMPUTE_CORE_TYPES_H
diff --git a/arm_compute/core/experimental/IPostOp.h b/arm_compute/core/experimental/IPostOp.h
deleted file mode 100644
index 567a402..0000000
--- a/arm_compute/core/experimental/IPostOp.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
-#define ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
-
-#include <memory>
-#include <numeric>
-#include <vector>
-
-namespace arm_compute
-{
-namespace experimental
-{
-/** Type of Post Op */
-enum class PostOpType
-{
-    Activation,
-    Eltwise_Add,
-    Eltwise_PRelu
-};
-/** An ordered sequence of type of Post Ops */
-using PostOpTypeSequence = std::vector<PostOpType>;
-/** An elementwise n-ary operation that can be appended to and fused with (at kernel-level) other operators
- *  It contains:
- *      1. The attributes of the original operator.
- *      2. Any additional tensor argument.
- *      3. The position of the previous op's dst tensor in its argument list ( @ref prev_dst_pos )
- *
- *  For example, a series of chained ops:
- *
- *          div(src1, relu(conv(src0, weights, bias, conv_info), act_info), div_info)
- *
- *      translates to
- *
- *          dst = conv(src0, weights, bias, conv_info)  // main op
- *          dst = relu(dst, act_info)                   // previous dst is placed in the first (and only) argument
- *          dst = div(src1, dst, div_info)              // previous dst is placed in the second argument
- *
- *      which in turn translates to:
- *
- *          main op: conv(src0, weights, bias, conv_info)
- *          post op1: relu(act_info, prev_dst_pos = 0)
- *          post op2: div(div_info, src1, prev_dst_pos = 1)
- *
- *  @note: On Broadcasting
- *      For n-ary post ops, the tensor arguments must not "widen" the dst tensor of the main op
- *      For example, for a dst of shape [14, 1, 34]:
- *          * post_op_arg1 = [1, 1, 34] is allowed: broadcast in dim 0
- *          * post_op_arg1 = [14, 1, 34] is allowed: no broadcast
- *          * post_op_arg1 = [1, 1, 34] is allowed: broadcast in dims 0 and 1
- *          * post_op_arg1 = [14, 15, 34] is NOT allowed: broadcast widens the dst tensor
- *
- * @note: On Data layout
- *      All post ops are data layout agnostic. This means post ops do not have an inherent idea of "width", "height" and so on.
- *      Should we want to perform a post op with 2 tensors of different data layouts (where data layouts are significant to both),
- *      then we need to perform necessary permutation op beforehand to unify their data layout before they can be fused with a post op
- *
- *      Note although post ops themselves should be able to support any data layout, the main op they fuse to may impose
- *      additional restrictions in the presence of post ops. For example, the implementation of a gemm op may only allow
- *      NHWC data layout if post ops are provided. Such restrictions are main op implementation specific.
- *
- *  @note: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type
- *  @note: If TensorRelatedT points to a resource, IPostOp assumes that resource is valid throughout its lifetime
- *        and the lifetime of its copies. This is almost guaranteed as IPostOp is only meant to be used at configure time
- *        after the ITensor or ITensorInfo objects are already constructed
- */
-template <typename TensorRelatedT>
-struct IPostOp
-{
-    /** Get the arity of the post op
-     * @note: that this is one fewer than the arity of the original op, because we implicitly pass the previous op's dst
-     *       tensor as one of the arguments
-     */
-    size_t arity() const
-    {
-        return arguments().size();
-    }
-    /** The position of previous op's dst in current op's argument list */
-    virtual int prev_dst_pos() const = 0;
-    /** The IPostOp type */
-    virtual PostOpType type() const = 0;
-    /** The argument tensors
-     * The order of the argument tensor is strictly preserved
-     */
-    virtual std::vector<TensorRelatedT *>       arguments()       = 0;
-    virtual std::vector<const TensorRelatedT *> arguments() const = 0;
-    /** Clone method used in cases where PostOps are owned by unique_ptr
-     * @note: This performs a shallow copy of the TensorRelatedT if TensorRelatedT points to a resource
-     */
-    virtual std::unique_ptr<IPostOp<TensorRelatedT>> clone() const = 0;
-    virtual ~IPostOp()
-    {
-    }
-};
-
-/** A sequence of PostOps that can be appended to the end of other operators */
-template <typename TensorRelatedT>
-class PostOpList
-{
-public:
-    /** Constructor */
-    PostOpList() = default;
-    /** Destructor */
-    ~PostOpList() = default;
-    PostOpList(const PostOpList &other)
-    {
-        for(const auto &op : other._post_ops)
-        {
-            this->_post_ops.push_back(op->clone());
-        }
-    }
-    PostOpList &operator=(const PostOpList &other)
-    {
-        PostOpList tmp{ other };
-        std::swap(tmp, *this);
-        return *this;
-    }
-    PostOpList(PostOpList &&other) = default;
-    PostOpList &operator=(PostOpList &&other) = default;
-
-    /** Add a new post op at the end of the list */
-    template <typename OpT, typename... Args>
-    void push_back_op(Args &&... args)
-    {
-        _post_ops.push_back(std::make_unique<OpT>(std::forward<Args>(args)...));
-    }
-
-    /** Number of post ops */
-    size_t size() const
-    {
-        return _post_ops.size();
-    }
-
-    /** Total number of post ops */
-    size_t total_num_arguments() const
-    {
-        return std::accumulate(_post_ops.begin(), _post_ops.end(), 0, [](size_t op1_arity, const auto & op2)
-        {
-            return op1_arity + op2->arity();
-        });
-    }
-
-    /** Get the underlying post op list */
-    std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list()
-    {
-        return _post_ops;
-    }
-    const std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> &get_list() const
-    {
-        return _post_ops;
-    }
-
-private:
-    std::vector<std::unique_ptr<IPostOp<TensorRelatedT>>> _post_ops{};
-};
-
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_IPOSTOP
diff --git a/arm_compute/core/experimental/PostOps.h b/arm_compute/core/experimental/PostOps.h
deleted file mode 100644
index a5585ba..0000000
--- a/arm_compute/core/experimental/PostOps.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2021, 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOPS
-#define ARM_COMPUTE_EXPERIMENTAL_POSTOPS
-
-#include "arm_compute/core/experimental/IPostOp.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/function_info/ActivationLayerInfo.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace experimental
-{
-/** (EXPERIMENTAL_POST_OPS)
- * Implementation of specific IPostOps
-*/
-
-template <typename TensorRelatedT>
-struct PostOpAct : public IPostOp<TensorRelatedT>
-{
-public:
-    PostOpAct(const ActivationLayerInfo &act_info)
-        : _act_info{ act_info }
-    {
-    }
-    // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy
-    ~PostOpAct() override        = default;
-    PostOpAct(const PostOpAct &) = default;
-    PostOpAct &operator=(const PostOpAct &) = default;
-    PostOpAct(PostOpAct &&)                 = default;
-    PostOpAct &operator=(PostOpAct &&) = default;
-
-    int prev_dst_pos() const override
-    {
-        return 0;
-    }
-    PostOpType type() const override
-    {
-        return PostOpType::Activation;
-    }
-    std::vector<TensorRelatedT *> arguments() override
-    {
-        return {};
-    }
-    std::vector<const TensorRelatedT *> arguments() const override
-    {
-        return {};
-    }
-    std::unique_ptr<IPostOp<TensorRelatedT>> clone() const override
-    {
-        return std::make_unique<PostOpAct<TensorRelatedT>>(*this);
-    }
-    ActivationLayerInfo _act_info;
-};
-
-template <typename TensorRelatedT>
-struct PostOpEltwiseAdd : public IPostOp<TensorRelatedT>
-{
-public:
-    PostOpEltwiseAdd(TensorRelatedT addend, int prev_dst_pos, ConvertPolicy policy)
-        : _addend{ addend },
-          _prev_dst_pos{ prev_dst_pos },
-          _policy{ policy }
-    {
-    }
-    // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy
-    ~PostOpEltwiseAdd() override               = default;
-    PostOpEltwiseAdd(const PostOpEltwiseAdd &) = default;
-    PostOpEltwiseAdd &operator=(const PostOpEltwiseAdd &) = default;
-    PostOpEltwiseAdd(PostOpEltwiseAdd &&)                 = default;
-    PostOpEltwiseAdd &operator=(PostOpEltwiseAdd &&) = default;
-    int               prev_dst_pos() const override
-    {
-        return _prev_dst_pos;
-    }
-    PostOpType type() const override
-    {
-        return PostOpType::Eltwise_Add;
-    }
-    std::vector<TensorRelatedT *> arguments() override
-    {
-        return { &_addend };
-    }
-    std::vector<const TensorRelatedT *> arguments() const override
-    {
-        return { &_addend };
-    }
-    std::unique_ptr<IPostOp<TensorRelatedT>> clone() const override
-    {
-        return std::make_unique<PostOpEltwiseAdd<TensorRelatedT>>(*this);
-    }
-    TensorRelatedT _addend;
-    int            _prev_dst_pos;
-    ConvertPolicy  _policy;
-};
-
-template <typename TensorRelatedT>
-struct PostOpEltwisePRelu : public IPostOp<TensorRelatedT>
-{
-public:
-    PostOpEltwisePRelu(TensorRelatedT alpha_param, int prev_dst_pos, ConvertPolicy policy)
-        : _alpha_param{ alpha_param },
-          _prev_dst_pos{ prev_dst_pos },
-          _policy{ policy }
-    {
-    }
-    // NOTE: PostOps do not own any resources pointed to by TensorRelatedT if it's a pointer type, thus allow shallow copy
-    ~PostOpEltwisePRelu() override                 = default;
-    PostOpEltwisePRelu(const PostOpEltwisePRelu &) = default;
-    PostOpEltwisePRelu &operator=(const PostOpEltwisePRelu &) = default;
-    PostOpEltwisePRelu(PostOpEltwisePRelu &&)                 = default;
-    PostOpEltwisePRelu &operator=(PostOpEltwisePRelu &&) = default;
-    int                 prev_dst_pos() const override
-    {
-        return _prev_dst_pos;
-    }
-    PostOpType type() const override
-    {
-        return PostOpType::Eltwise_PRelu;
-    }
-    std::vector<TensorRelatedT *> arguments() override
-    {
-        return { &_alpha_param };
-    }
-    std::vector<const TensorRelatedT *> arguments() const override
-    {
-        return { &_alpha_param };
-    }
-    std::unique_ptr<IPostOp<TensorRelatedT>> clone() const override
-    {
-        return std::make_unique<PostOpEltwisePRelu<TensorRelatedT>>(*this);
-    }
-    TensorRelatedT _alpha_param;
-    int            _prev_dst_pos;
-    ConvertPolicy  _policy;
-};
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOPS
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
index 1995ab0..8dd6812 100644
--- a/arm_compute/core/experimental/Types.h
+++ b/arm_compute/core/experimental/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_EXPERIMENTAL_TYPES_H
-#define ARM_COMPUTE_EXPERIMENTAL_TYPES_H
+#ifndef ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
+#define ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
 
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/TensorShape.h"
@@ -78,11 +78,6 @@
     ACL_VEC_COL_SUM = ACL_SRC_4,
     ACL_SHIFTS      = ACL_SRC_5,
     ACL_MULTIPLIERS = ACL_SRC_6,
-
-    // (EXPERIMENTAL_POST_OPS) Post ops arguments begin after everything else
-    EXPERIMENTAL_ACL_POST_OP_ARG       = 2048,
-    EXPERIMENTAL_ACL_POST_OP_ARG_FIRST = EXPERIMENTAL_ACL_POST_OP_ARG,
-    EXPERIMENTAL_ACL_POST_OP_ARG_LAST  = EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + 1024, // Max number of post op arguments
 };
 
 namespace experimental
@@ -134,4 +129,4 @@
 using MemoryRequirements = std::vector<MemoryInfo>;
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_EXPERIMENTAL_TYPES_H */
+#endif // ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
diff --git a/arm_compute/function_info/GEMMInfo.h b/arm_compute/function_info/GEMMInfo.h
index daaf862..29a57a0 100644
--- a/arm_compute/function_info/GEMMInfo.h
+++ b/arm_compute/function_info/GEMMInfo.h
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO
-#define ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
 
 #include "arm_compute/core/CoreTypes.h"
-#include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include <vector>
 
@@ -79,7 +78,6 @@
           _pretranspose_A(false),
           _pretranspose_B(false),
           _activation_info(),
-          _post_ops(),
           _fixed_format(false),
           _weight_format(arm_compute::WeightFormat::UNSPECIFIED)
     {
@@ -99,14 +97,12 @@
      * @param[in] fast_math                   (Optional) Use a data type of shorter width to improve performance
      * @param[in] broadcast_bias              (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
      * @param[in] activation_info             (Optional) Activation to apply after the matrix multiplication
-     * @param[in] post_ops                    (Optional) A sequence of post operations that are performed after the main operation.
      * @param[in] fixed_format                (Optional) Specify the selection of fixed format kernels for variable weights support in GEMM. These kernels expect the weights tensor to be in amemory format that is fixed by the kernel itself. For more information, see arm_compute::WeightFormat.
      * @param[in] weight_format               (Optional) arm_gemm:WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED.
      */
     GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
              GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool fast_math = false, bool broadcast_bias = false,
-             const ActivationLayerInfo &activation_info = ActivationLayerInfo(), const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *>(),
-             bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) noexcept
+             const ActivationLayerInfo &activation_info = ActivationLayerInfo(), bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) noexcept
         : _is_a_reshaped(is_a_reshaped),
           _is_b_reshaped(is_b_reshaped),
           _reshape_b_only_on_first_run(reshape_b_only_on_first_run),
@@ -120,7 +116,6 @@
           _pretranspose_A(false),
           _pretranspose_B(false),
           _activation_info(activation_info),
-          _post_ops(post_ops),
           _fixed_format(fixed_format),
           _weight_format(weight_format)
     {
@@ -271,22 +266,6 @@
     {
         _activation_info = activation_info;
     }
-    /** Post operations to apply after the matrix multiplication
-     *
-     * @return experimental::PostOpList object
-     */
-    const experimental::PostOpList<ITensorInfo *> &post_ops() const
-    {
-        return _post_ops;
-    }
-    /** Set post ops
-     *
-     * @param[in] post_ops experimental::PostOpList object to set
-     */
-    void set_post_ops(const experimental::PostOpList<ITensorInfo *> &post_ops)
-    {
-        _post_ops = post_ops;
-    }
     /** Flag which specifies if the GEMM operation is running fixed-format kernels.
      *
      * @return True if the GEMM operation is running fixed-format kernel else false.
@@ -320,22 +299,21 @@
     }
 
 private:
-    bool                                    _is_a_reshaped;
-    bool                                    _is_b_reshaped;
-    bool                                    _reshape_b_only_on_first_run;
-    int                                     _depth_output_gemm3d;
-    bool                                    _reinterpret_input_as_3d;
-    bool                                    _retain_internal_weights;
-    GEMMLowpOutputStageInfo                 _gemmlowp_output_stage;
-    bool                                    _fast_math;
-    bool                                    _fp_mixed_precision;
-    bool                                    _broadcast_bias;
-    bool                                    _pretranspose_A;
-    bool                                    _pretranspose_B;
-    ActivationLayerInfo                     _activation_info;
-    experimental::PostOpList<ITensorInfo *> _post_ops;
-    bool                                    _fixed_format;
-    arm_compute::WeightFormat               _weight_format;
+    bool                      _is_a_reshaped;
+    bool                      _is_b_reshaped;
+    bool                      _reshape_b_only_on_first_run;
+    int                       _depth_output_gemm3d;
+    bool                      _reinterpret_input_as_3d;
+    bool                      _retain_internal_weights;
+    GEMMLowpOutputStageInfo   _gemmlowp_output_stage;
+    bool                      _fast_math;
+    bool                      _fp_mixed_precision;
+    bool                      _broadcast_bias;
+    bool                      _pretranspose_A;
+    bool                      _pretranspose_B;
+    ActivationLayerInfo       _activation_info;
+    bool                      _fixed_format;
+    arm_compute::WeightFormat _weight_format;
 };
 } //namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO */
+#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
diff --git a/arm_compute/graph/DataLayerVisitor.h b/arm_compute/graph/DataLayerVisitor.h
index ac7f1c8..11d9f1d 100644
--- a/arm_compute/graph/DataLayerVisitor.h
+++ b/arm_compute/graph/DataLayerVisitor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H
-#define ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
+#define ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
 
 #include "arm_compute/graph/IGraphPrinter.h"
 #include "arm_compute/graph/INodeVisitor.h"
@@ -48,7 +48,6 @@
     void visit(ConvolutionLayerNode &n) override;
     void visit(DepthwiseConvolutionLayerNode &n) override;
     void visit(FusedConvolutionBatchNormalizationNode &n) override;
-    void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
     void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
     void visit(OutputNode &n) override;
 
@@ -59,4 +58,4 @@
 };
 } // namespace graph
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DATALAYERPRINTER_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
diff --git a/arm_compute/graph/INode.h b/arm_compute/graph/INode.h
index becd672..5646ea8 100644
--- a/arm_compute/graph/INode.h
+++ b/arm_compute/graph/INode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019,2021 Arm Limited.
+ * Copyright (c) 2018-2019,2021,2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_INODE_H
-#define ARM_COMPUTE_GRAPH_INODE_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_INODE_H
+#define ACL_ARM_COMPUTE_GRAPH_INODE_H
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/graph/LayerDescriptors.h"
@@ -241,30 +241,19 @@
      * @return Assigned target of this node
      */
     Target assigned_target() const;
-    /** Post operator info list
-     *
-     * @return Post operator info list
-     */
-    const std::list<std::unique_ptr<ConvPostOpInfo>> &post_op_info_list() const;
-    /** Post operator info list
-     *
-     * @return Post operator info list
-     */
-    std::list<std::unique_ptr<ConvPostOpInfo>> &post_op_info_list();
 
 protected:
     friend class Graph;
 
 protected:
-    Graph                                     *_graph;             /**< Backward reference to graph owning the node */
-    NodeID                                     _id;                /**< Node ID */
-    NodeParams                                 _common_params;     /**< Node common params */
-    std::vector<TensorID>                      _outputs;           /**< Output of the node */
-    std::vector<EdgeID>                        _input_edges;       /**< Inputs edge set */
-    std::set<EdgeID>                           _output_edges;      /**< Output edge set */
-    Target                                     _assigned_target;   /**< Assigned target by the Graph executor */
-    std::list<std::unique_ptr<ConvPostOpInfo>> _post_op_info_list; /**< Post operator info list */
+    Graph                *_graph;           /**< Backward reference to graph owning the node */
+    NodeID                _id;              /**< Node ID */
+    NodeParams            _common_params;   /**< Node common params */
+    std::vector<TensorID> _outputs;         /**< Output of the node */
+    std::vector<EdgeID>   _input_edges;     /**< Inputs edge set */
+    std::set<EdgeID>      _output_edges;    /**< Output edge set */
+    Target                _assigned_target; /**< Assigned target by the Graph executor */
 };
 } // namespace graph
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_INODE_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_INODE_H
diff --git a/arm_compute/graph/INodeVisitor.h b/arm_compute/graph/INodeVisitor.h
index 97e9533..efe191a 100644
--- a/arm_compute/graph/INodeVisitor.h
+++ b/arm_compute/graph/INodeVisitor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_INODEVISITOR_H
-#define ARM_COMPUTE_GRAPH_INODEVISITOR_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
+#define ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
 
 #include "arm_compute/graph/nodes/NodesFwd.h"
 
@@ -106,16 +106,6 @@
      * @param[in] n Node to visit.
      */
     virtual void visit(FusedConvolutionBatchNormalizationNode &n) = 0;
-    /** Visit FusedConvolutionBatchNormalizationWithPostOpsNode.
-     *
-     * @param[in] n Node to visit.
-     */
-    virtual void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) = 0;
-    /** Visit FusedConvolutionWithPostOpNode.
-     *
-     * @param[in] n Node to visit.
-     */
-    virtual void visit(FusedConvolutionWithPostOpNode &n) = 0;
     /** Visit FusedDepthwiseConvolutionBatchNormalizationNode.
      *
      * @param[in] n Node to visit.
@@ -215,8 +205,6 @@
     virtual void visit(FlattenLayerNode &n) override;
     virtual void visit(FullyConnectedLayerNode &n) override;
     virtual void visit(FusedConvolutionBatchNormalizationNode &n) override;
-    virtual void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
-    virtual void visit(FusedConvolutionWithPostOpNode &n) override;
     virtual void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
     virtual void visit(InputNode &n) override;
     virtual void visit(NormalizationLayerNode &n) override;
@@ -240,4 +228,4 @@
 };
 } // namespace graph
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_INODEVISITOR_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h
index 8f97bbf..9df4eba 100644
--- a/arm_compute/graph/TypePrinter.h
+++ b/arm_compute/graph/TypePrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_TYPE_PRINTER_H
-#define ARM_COMPUTE_GRAPH_TYPE_PRINTER_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
+#define ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
@@ -116,12 +116,6 @@
         case NodeType::FusedConvolutionBatchNormalizationLayer:
             os << "FusedConvolutionBatchNormalizationLayer";
             break;
-        case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer:
-            os << "FusedConvolutionBatchNormalizationLayerWithPostOpsLayer";
-            break;
-        case NodeType::FusedConvolutionWithPostOp:
-            os << "FusedConvolutionWithPostOp";
-            break;
         case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
             os << "FusedDepthwiseConvolutionBatchNormalizationLayer";
             break;
@@ -295,4 +289,4 @@
 }
 } // namespace graph
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_TYPE_PRINTER_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index 167f738..8d49340 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_TYPES_H
-#define ARM_COMPUTE_GRAPH_TYPES_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_TYPES_H
+#define ACL_ARM_COMPUTE_GRAPH_TYPES_H
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/PixelValue.h"
@@ -41,32 +41,31 @@
 {
 namespace graph
 {
-using arm_compute::CLTunerMode;
 using arm_compute::CLBackendType;
+using arm_compute::CLTunerMode;
 using arm_compute::Status;
 
 using arm_compute::Coordinates;
-using arm_compute::DataType;
 using arm_compute::DataLayout;
 using arm_compute::DataLayoutDimension;
-using arm_compute::TensorShape;
-using arm_compute::Size2D;
+using arm_compute::DataType;
 using arm_compute::PermutationVector;
 using arm_compute::PixelValue;
+using arm_compute::Size2D;
+using arm_compute::TensorShape;
 
 using arm_compute::ActivationLayerInfo;
 using arm_compute::DetectionOutputLayerInfo;
 using arm_compute::DetectionPostProcessLayerInfo;
-using arm_compute::NormType;
-using arm_compute::NormalizationLayerInfo;
+using arm_compute::DimensionRoundingType;
 using arm_compute::FullyConnectedLayerInfo;
+using arm_compute::InterpolationPolicy;
+using arm_compute::NormalizationLayerInfo;
+using arm_compute::NormType;
 using arm_compute::PadStrideInfo;
 using arm_compute::PoolingLayerInfo;
 using arm_compute::PoolingType;
 using arm_compute::PriorBoxLayerInfo;
-using arm_compute::DimensionRoundingType;
-using arm_compute::InterpolationPolicy;
-using arm_compute::experimental::PostOpType;
 
 using GraphID    = unsigned int;
 using TensorID   = unsigned int;
@@ -150,55 +149,6 @@
     Disabled, /**< Fast math disabled for Convolution layer */
 };
 
-/** Convolution post operator info */
-class ConvPostOpInfo
-{
-public:
-    /** Returns post op type
-     *
-     * @return Post op type
-     */
-    virtual PostOpType type() const = 0;
-    virtual ~ConvPostOpInfo()
-    {
-    }
-};
-
-class ConvPostOpInfoActivation : public ConvPostOpInfo
-{
-public:
-    ConvPostOpInfoActivation(const ActivationLayerInfo &act)
-        : _act(act)
-    {
-    }
-    ~ConvPostOpInfoActivation() override
-    {
-    }
-    PostOpType type() const override
-    {
-        return PostOpType::Activation;
-    }
-    ActivationLayerInfo _act;
-};
-
-class ConvPostOpInfoEltwiseAdd : public ConvPostOpInfo
-{
-public:
-    ConvPostOpInfoEltwiseAdd(int arg_pos, const ConvertPolicy &policy)
-        : _prev_op_dst_pos(arg_pos), _policy(policy)
-    {
-    }
-    PostOpType type() const override
-    {
-        return PostOpType::Eltwise_Add;
-    }
-    ~ConvPostOpInfoEltwiseAdd() override
-    {
-    }
-    int           _prev_op_dst_pos;
-    ConvertPolicy _policy;
-};
-
 /** Supported nodes */
 enum class NodeType
 {
@@ -219,8 +169,6 @@
     FlattenLayer,
     FullyConnectedLayer,
     FusedConvolutionBatchNormalizationLayer,
-    FusedConvolutionWithPostOp,
-    FusedConvolutionBatchNormalizationLayerWithPostOpsLayer,
     FusedDepthwiseConvolutionBatchNormalizationLayer,
     GenerateProposalsLayer,
     L2NormalizeLayer,
@@ -278,4 +226,4 @@
 };
 } // namespace graph
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_TYPES_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_TYPES_H
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 803283e..a567427 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
+#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
 
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/core/experimental/PostOps.h"
 #include "arm_compute/graph/Logger.h"
 #include "arm_compute/graph/Tensor.h"
 #include "arm_compute/graph/TypePrinter.h"
 #include "arm_compute/graph/Types.h"
 #include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h"
-#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h"
 #include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h"
 #include "arm_compute/graph/backends/Utils.h"
 #include "arm_compute/graph/nodes/Nodes.h"
@@ -541,183 +538,6 @@
     return std::move(func);
 }
 
-/** Create a backend convolution layer function with post operator
- *
- * @tparam ConvolutionLayerFunctions Backend convolution functions
- * @tparam TargetInfo                Target-specific information
- *
- * @param[in] node Node to create the backend function for
- * @param[in] ctx  Graph context
- *
- * @return Backend convolution layer function
- */
-template <typename ConvolutionLayerFunctions, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_convolution_with_post_op(FusedConvolutionWithPostOpNode &node, GraphContext &ctx)
-{
-    validate_node<TargetInfo>(node, 4 /* expected inputs */, 1 /* expected outputs */);
-
-    // Extract IO and info
-    typename TargetInfo::TensorType *input   = get_backing_tensor<TargetInfo>(node.input(0));
-    typename TargetInfo::TensorType *weights = get_backing_tensor<TargetInfo>(node.input(1));
-    typename TargetInfo::TensorType *biases  = get_backing_tensor<TargetInfo>(node.input(2));
-    typename TargetInfo::TensorType *output  = get_backing_tensor<TargetInfo>(node.output(0));
-
-    const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-    if(is_quantized)
-    {
-        biases->info()->set_data_type(DataType::S32);
-    }
-
-    const PadStrideInfo       conv_info  = node.convolution_info();
-    const unsigned int        num_groups = node.num_groups();
-    const ActivationLayerInfo fused_act  = node.fused_activation();
-
-    experimental::PostOpList<typename TargetInfo::TensorType *> post_ops;
-
-    auto &post_op_info_list = node.post_op_info_list();
-    for(const auto &post_op_info : post_op_info_list)
-    {
-        switch(post_op_info->type())
-        {
-            case PostOpType::Activation:
-            {
-                const auto act_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoActivation *>(post_op_info.get());
-                post_ops.template push_back_op<experimental::PostOpAct<typename TargetInfo::TensorType *>>(act_info->_act);
-                break;
-            }
-            case PostOpType::Eltwise_Add:
-            {
-                typename TargetInfo::TensorType *add_input    = get_backing_tensor<TargetInfo>(node.input(3));
-                const auto                       eltwise_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoEltwiseAdd *>(post_op_info.get());
-                post_ops.template push_back_op<experimental::PostOpEltwiseAdd<typename TargetInfo::TensorType *>>(add_input, eltwise_info->_prev_op_dst_pos, eltwise_info->_policy);
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Unsupported PostOpType");
-            }
-        }
-    }
-
-    // Create and configure function (we assume that functions have been validated before creation)
-    std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, TargetInfo::TargetType);
-    std::unique_ptr<IFunction>      func;
-    std::string                     func_name;
-
-    // Fuse convolution with post ops is only supported for conv1x1, which is only implemented as gemmconv2d
-    std::tie(func, func_name) = create_named_memory_managed_function<typename ConvolutionLayerFunctions::GEMMConvolutionLayer>(
-                                    std::string("GEMMConvolutionLayer"), mm,
-                                    input, weights, biases, output, conv_info,
-                                    WeightsInfo(), Size2D(1U, 1U), fused_act, num_groups, post_ops);
-
-    // Log info
-    std::ostringstream qss;
-    if(is_quantized)
-    {
-        qss << " Input QuantInfo: " << input->info()->quantization_info()
-            << " Weights QuantInfo: " << weights->info()->quantization_info()
-            << " Output QuantInfo: " << output->info()->quantization_info();
-    }
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
-                               << node.name()
-                               << " Type: " << func_name
-                               << " Target: " << TargetInfo::TargetType
-                               << " Data Type: " << input->info()->data_type()
-                               << " Groups: " << num_groups
-                               << " Input shape: " << input->info()->tensor_shape()
-                               << " Weights shape: " << weights->info()->tensor_shape()
-                               << " Output shape: " << output->info()->tensor_shape()
-                               << qss.str()
-                               << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
-                               << " Post ops" << post_ops
-                               << std::endl);
-    return std::move(func);
-}
-
-/** Create a backend convolution batch normalization layer function with post operator
- *
- * @tparam FusedLayerTypes           Backend convolution functions
- * @tparam TargetInfo                Target-specific information
- *
- * @param[in] node Node to create the backend function for
- * @param[in] ctx  Graph context
- *
- * @return Backend fused convolution with batch normalization layer function
- */
-template <typename FusedLayerTypes, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_with_post_op(FusedConvolutionBatchNormalizationWithPostOpsNode &node, GraphContext &ctx)
-{
-    validate_node<TargetInfo>(node, 8 /* expected inputs */, 1 /* expected outputs */);
-
-    // Extract IO and info
-    typename TargetInfo::TensorType *input   = get_backing_tensor<TargetInfo>(node.input(0));
-    typename TargetInfo::TensorType *weights = get_backing_tensor<TargetInfo>(node.input(1));
-    typename TargetInfo::TensorType *biases  = get_backing_tensor<TargetInfo>(node.input(2));
-    typename TargetInfo::TensorType *mean    = get_backing_tensor<TargetInfo>(node.input(3));
-    typename TargetInfo::TensorType *var     = get_backing_tensor<TargetInfo>(node.input(4));
-    typename TargetInfo::TensorType *beta    = get_backing_tensor<TargetInfo>(node.input(5));
-    typename TargetInfo::TensorType *gamma   = get_backing_tensor<TargetInfo>(node.input(6));
-
-    typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
-
-    const PadStrideInfo conv_info  = node.convolution_info();
-    const unsigned int  num_groups = node.num_groups();
-    const bool          fast_math  = node.fast_math_hint() == FastMathHint::Enabled;
-    const float         epsilon    = node.epsilon();
-
-    experimental::PostOpList<typename TargetInfo::TensorType *> post_ops;
-
-    auto &post_op_info_list = node.post_op_info_list();
-    for(const auto &post_op_info : post_op_info_list)
-    {
-        switch(post_op_info->type())
-        {
-            case PostOpType::Activation:
-            {
-                const auto act_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoActivation *>(post_op_info.get());
-                post_ops.template push_back_op<experimental::PostOpAct<typename TargetInfo::TensorType *>>(act_info->_act);
-                break;
-            }
-            case PostOpType::Eltwise_Add:
-            {
-                typename TargetInfo::TensorType *add_input    = get_backing_tensor<TargetInfo>(node.input(3));
-                const auto                       eltwise_info = utils::cast::polymorphic_downcast<const ConvPostOpInfoEltwiseAdd *>(post_op_info.get());
-                post_ops.template push_back_op<experimental::PostOpEltwiseAdd<typename TargetInfo::TensorType *>>(add_input, eltwise_info->_prev_op_dst_pos, eltwise_info->_policy);
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Unsupported PostOpType");
-            }
-        }
-    }
-
-    // Create and configure function (we assume that functions have been validated before creation)
-    std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, TargetInfo::TargetType);
-    std::unique_ptr<IFunction>      func;
-    std::string                     func_name;
-
-    using FType = FusedConvolutionBatchNormalizationWithPostOpsFunction<TargetInfo, FusedLayerTypes>;
-
-    // Create and configure function
-    std::tie(func, func_name) = create_named_memory_managed_function<FType>(
-                                    std::string("FusedConvolutionBatchNormalizationLayerWithPostOpsLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, num_groups, fast_math, post_ops);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
-                               << node.name()
-                               << " Type: " << node.type()
-                               << " Target: " << TargetInfo::TargetType
-                               << " Data Type: " << input->info()->data_type()
-                               << " Input shape: " << input->info()->tensor_shape()
-                               << " Weights shape: " << weights->info()->tensor_shape()
-                               << " Output shape: " << output->info()->tensor_shape()
-                               << " Post Ops:" << post_ops
-                               << std::endl);
-    return std::move(func);
-}
-
 /** Create a backend deconvolution layer function
  *
  * @tparam DeconvolutionLayerFunction Backend deconvolution function
@@ -2025,4 +1845,4 @@
 } // namespace graph
 } // namespace arm_compute
 
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
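For reference, the removed create_fused_convolution_batch_normalization_with_post_op() helper above translated the node's post-op descriptors into the experimental PostOpList consumed by the backend function. A minimal sketch of that legacy pattern, using the experimental types deleted by this change (the addend tensor, activation and ConvertPolicy values are placeholders, not taken from the original code):

    // Sketch only: PostOpList, PostOpAct, PostOpEltwiseAdd and push_back_op()
    // come from the experimental PostOps headers removed by this change.
    ITensorInfo *addend = nullptr; // placeholder: info of the tensor added element-wise

    experimental::PostOpList<ITensorInfo *> post_ops;
    post_ops.push_back_op<experimental::PostOpAct<ITensorInfo *>>(
        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
        addend,
        0 /* prev_op_dst_pos: position of the previous op's dst among the arguments */,
        ConvertPolicy::SATURATE);

With this change no graph node builds such a list any more; the equivalent fusion is expressed through the Dynamic Fusion interface instead.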
diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h
deleted file mode 100644
index 10f2e5c..0000000
--- a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationWithPostOpsFunction.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-namespace backends
-{
-/** Wrapper function to first apply {NE, CL}BatchNormalizationLayer on the weights and then run {NE, CL}ConvolutionLayer with the modified weights */
-template <typename TargetInfo, typename FusedLayerTypes>
-class FusedConvolutionBatchNormalizationWithPostOpsFunction : public IFunction
-{
-public:
-    using TensorType         = typename TargetInfo::TensorType;
-    using TensorConcreteType = typename TargetInfo::TensorConcreteType;
-
-    FusedConvolutionBatchNormalizationWithPostOpsFunction(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
-        : _conv_layer(memory_manager), _fused_batch_norm_layer(), _fused_bias(), _is_prepared(false)
-    {
-    }
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input      Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                        while every optional dimension from 4 and above represent a batch of inputs.
-     *                        Data types supported: QASYMM8/F16/F32.
-     * @param[in]  weights    Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
-     * @param[in]  bias       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                        Data type supported: Should match @p input data type.
-     * @param[out] output     Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                        Data types supported: Same as @p input.
-     * @param[in]  mean       Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]  var        Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]  beta       Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
-     * @param[in]  gamma      Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
-     * @param[in]  epsilon    Small value to avoid division with zero. Default value is 0.001f.
-     * @param[in]  conv_info  Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  num_groups Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     * @param[in]  fast_math  Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
-     *                        available which may introduce a drop of accuracy as well. Default is false
-     * @param[in]  post_ops   A sequence of post operations that are performed after the main operation.
-     *
-     */
-    void configure(TensorType       *input,
-                   TensorType       *weights,
-                   TensorType       *bias,
-                   TensorType       *output,
-                   const TensorType *mean,
-                   const TensorType *var,
-                   const TensorType *beta,
-                   const TensorType *gamma,
-                   float epsilon, const PadStrideInfo &conv_info, unsigned int num_groups, bool fast_math,
-                   const arm_compute::experimental::PostOpList<TensorType *> &post_ops = experimental::PostOpList<TensorType *> {})
-    {
-        // We don't run any validate, as we assume that the layers have been already validated
-        const bool        has_bias = (bias != nullptr);
-        const TensorType *bias_to_use;
-
-        // We check if the layer has a bias. If yes, use it in-place. If not, we need to create one
-        // as batch normalization might end up with a bias != 0
-        if(has_bias)
-        {
-            _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon);
-            bias_to_use = bias;
-        }
-        else
-        {
-            _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon);
-            bias_to_use = &_fused_bias;
-        }
-
-        ActivationLayerInfo fused_act = ActivationLayerInfo(); // Passing an empty ActivationLayerInfo.
-        _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups, post_ops);
-
-        if(!has_bias)
-        {
-            _fused_bias.allocator()->allocate();
-        }
-    }
-
-    // Inherited methods overridden:
-    void run()
-    {
-        prepare();
-        _conv_layer.run();
-    }
-
-    void prepare()
-    {
-        if(!_is_prepared)
-        {
-            _fused_batch_norm_layer.run();
-            _is_prepared = true;
-        }
-    }
-
-private:
-    typename FusedLayerTypes::ConvolutionLayer       _conv_layer;
-    typename FusedLayerTypes::FuseBatchNormalization _fused_batch_norm_layer;
-    TensorConcreteType                               _fused_bias;
-    bool                                             _is_prepared;
-};
-} // namespace backends
-} // namespace graph
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_WITH_POST_OPS_FUNCTION_H */
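The configure() logic in the file deleted above relies on FuseBatchNormalization to fold the batch-norm statistics into the convolution weights and bias before the convolution runs. A small, self-contained scalar sketch of that standard per-output-channel folding (an illustration of the idea, not the library code itself):

    #include <cmath>

    // Standard conv + batch-norm folding:
    //   y = gamma * (conv(x, w) + b - mean) / sqrt(var + eps) + beta
    // becomes conv(x, w') + b' with the folded weight and bias below.
    float fold_weight(float w, float gamma, float var, float eps)
    {
        return w * gamma / std::sqrt(var + eps);
    }

    float fold_bias(float b, float mean, float var, float beta, float gamma, float eps)
    {
        const float scale = gamma / std::sqrt(var + eps);
        return (b - mean) * scale + beta;
    }

When the convolution has no bias, b is simply zero, which is why the removed wrapper allocated its own _fused_bias tensor in that branch.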
diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h
index 89dccd8..71a6201 100644
--- a/arm_compute/graph/backends/ValidateHelpers.h
+++ b/arm_compute/graph/backends/ValidateHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
+#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
 
 #include "arm_compute/graph/Logger.h"
 #include "arm_compute/graph/Tensor.h"
@@ -183,42 +183,6 @@
     return status;
 }
 
-/** Validates a Convolution layer node
- *
- * @tparam GEMMConvolutionLayer      GEMM Convolution layer function type
- *
- * @param[in] node Node to validate
- *
- * @return Status
- */
-template <typename GEMMConvolutionLayer>
-Status validate_fused_convolution_with_post_op(FusedConvolutionWithPostOpNode &node)
-{
-    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating fused ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
-    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 4);
-    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
-
-    // Extract IO and info
-    arm_compute::ITensorInfo *input   = get_backing_tensor_info(node.input(0));
-    arm_compute::ITensorInfo *weights = get_backing_tensor_info(node.input(1));
-    arm_compute::ITensorInfo *biases  = get_backing_tensor_info(node.input(2));
-    arm_compute::ITensorInfo *output  = get_backing_tensor_info(node.output(0));
-
-    if(is_data_type_quantized_asymmetric(input->data_type()))
-    {
-        biases->set_data_type(DataType::S32);
-    }
-
-    const PadStrideInfo conv_info = node.convolution_info();
-    //const ConvolutionMethod conv_algorithm = node.convolution_method();
-    //const bool              fast_math      = node.fast_math_hint() == FastMathHint::Enabled;
-    const unsigned int num_groups = node.num_groups();
-
-    // Validate function
-    return GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info,
-                                          WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), num_groups);
-}
-
 /** Validates a Depthwise Convolution layer node
  *
  * @tparam DepthwiseConvolutionLayer    Default Depthwise Convolution layer type
@@ -775,4 +739,4 @@
 } // namespace graph
 } // namespace arm_compute
 
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h
deleted file mode 100644
index a42e06d..0000000
--- a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H
-#define ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_BATCH_NORMALIZATION_WITH_POST_OPS_NODE_H
-
-#include "arm_compute/graph/INode.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-/** Batch Normalization node */
-class FusedConvolutionBatchNormalizationWithPostOpsNode final : public INode
-{
-public:
-    /** Constructor
-     *
-     * @param[in] epsilon        Epsilon parameter.
-     * @param[in] info           Convolution layer attributes.
-     * @param[in] num_groups     (Optional) Number of groups (Defaults to 1)
-     * @param[in] method         (Optional) Convolution method to use
-     * @param[in] fast_math_hint (Optional) Fast math hint
-     */
-    FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info,
-                                                      unsigned int      num_groups     = 1,
-                                                      ConvolutionMethod method         = ConvolutionMethod::Default,
-                                                      FastMathHint      fast_math_hint = FastMathHint::Disabled);
-
-    /** Epsilon parameter accessor
-     *
-     * @return Epsilon parameter
-     */
-    float epsilon() const;
-
-    /** Computes convolution output descriptor
-     *
-     * @param[in] input_descriptor   Input descriptor
-     * @param[in] weights_descriptor Weights descriptor
-     * @param[in] info               Convolution operation attributes
-     *
-     * @return Output descriptor
-     */
-    static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                      const TensorDescriptor &weights_descriptor,
-                                                      const PadStrideInfo    &info);
-
-    /** Sets the convolution layer method to use
-     *
-     * @param[in] method Method to use for convolution
-     */
-    void set_convolution_method(ConvolutionMethod method);
-
-    /** Number of groups in convolution accessor
-     *
-     * @return Number of groups in convolution
-     */
-    unsigned int num_groups() const;
-
-    /** Convolution layer method accessor
-     *
-     * @note This is an indication on which convolution layer implementation to use,
-     *       if it fails to be created the library's heuristic approach will be used
-     *
-     * @return Convolution layer method to be used by the node
-     */
-    ConvolutionMethod convolution_method() const;
-
-    /** Sets the fast math hint
-     *
-     * @param[in] hint Hint to use for convolution
-     */
-    void set_fast_math_hint(FastMathHint hint);
-
-    /** Fast math hint accessor
-     *
-     * @return Fast math hint to be used by the node
-     */
-    FastMathHint fast_math_hint() const;
-
-    /** Convolution metadata accessor
-     *
-     * @return Convolution information
-     */
-    PadStrideInfo convolution_info() const;
-
-    // Inherited overridden methods:
-    NodeType         type() const override;
-    bool             forward_descriptors() override;
-    TensorDescriptor configure_output(size_t idx) const override;
-    void accept(INodeVisitor &v) override;
-
-public:
-    static constexpr NodeType node_type = NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer;
-
-private:
-    float _epsilon;
-
-    PadStrideInfo     _info;
-    unsigned int      _num_groups;
-    ConvolutionMethod _method;
-    FastMathHint      _fast_math_hint;
-};
-
-} // namespace graph
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BATCH_NORMALIZATION_LAYER_NODE_H */
diff --git a/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h b/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h
deleted file mode 100644
index 6048994..0000000
--- a/arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H
-#define ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H
-
-#include "arm_compute/graph/INode.h"
-
-#include <list>
-
-namespace arm_compute
-{
-namespace graph
-{
-/** Convolution node */
-class FusedConvolutionWithPostOpNode final : public INode
-{
-public:
-    /** Constructor
-     *
-     * @param[in] info           Convolution layer attributes
-     * @param[in] num_groups     (Optional) Number of groups (Defaults to 1)
-     * @param[in] method         (Optional) Convolution method to use
-     * @param[in] fast_math_hint (Optional) Fast math hint
-     * @param[in] out_quant_info (Optional) Output quantization info
-     */
-    FusedConvolutionWithPostOpNode(PadStrideInfo     info,
-                                   unsigned int      num_groups     = 1,
-                                   ConvolutionMethod method         = ConvolutionMethod::Default,
-                                   FastMathHint      fast_math_hint = FastMathHint::Disabled,
-                                   QuantizationInfo  out_quant_info = QuantizationInfo());
-    /** Sets the convolution layer method to use
-     *
-     * @param[in] method Method to use for convolution
-     */
-    void set_convolution_method(ConvolutionMethod method);
-    /** Convolution layer method accessor
-     *
-     * @note This is an indication on which convolution layer implementation to use,
-     *       if it fails to be created the library's heuristic approach will be used
-     *
-     * @return Convolution layer method to be used by the node
-     */
-    ConvolutionMethod convolution_method() const;
-    /** Sets the fast math fast hint
-     *
-     * @param[in] hint Hint to use for convolution
-     */
-    void set_fast_math_hint(FastMathHint hint);
-    /** Fast math hint accessor
-     *
-     * @return Fast math hint to be used by the node
-     */
-    FastMathHint fast_math_hint() const;
-    /** Convolution metadata accessor
-     *
-     * @return Convolution information
-     */
-    PadStrideInfo convolution_info() const;
-    /** Number of groups in convolution accessor
-     *
-     * @return Number of groups in convolution
-     */
-    unsigned int num_groups() const;
-    /** Returns fused activation
-     *
-     * @return Fused activation
-     */
-    ActivationLayerInfo fused_activation() const;
-    /** Sets fused activation
-     *
-     * @param[in] fused_activation Fused activation to set
-     */
-    void set_fused_activation(ActivationLayerInfo fused_activation);
-    /** Sets convolution info
-     *
-     * @param[in] info Convolution info to set
-     */
-    void set_convolution_info(PadStrideInfo info);
-    /** Computes convolution output descriptor
-     *
-     * @param[in] input_descriptor   Input descriptor
-     * @param[in] weights_descriptor Weights descriptor
-     * @param[in] info               Convolution operation attributes
-     *
-     * @return Output descriptor
-     */
-    static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                      const TensorDescriptor &weights_descriptor,
-                                                      const PadStrideInfo    &info);
-
-    // Inherited overridden methods:
-    NodeType         type() const override;
-    bool             forward_descriptors() override;
-    TensorDescriptor configure_output(size_t idx) const override;
-    void accept(INodeVisitor &v) override;
-
-public:
-    static constexpr NodeType node_type = NodeType::FusedConvolutionWithPostOp;
-
-private:
-    PadStrideInfo       _info;
-    unsigned int        _num_groups;
-    ConvolutionMethod   _method;
-    FastMathHint        _fast_math_hint;
-    QuantizationInfo    _out_quant_info;
-    ActivationLayerInfo _fused_activation;
-};
-} // namespace graph
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_GRAPH_FUSED_CONVOLUTION_WITH_POST_OP_NODE_H */
diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h
index 3887eae..ae9f177 100644
--- a/arm_compute/graph/nodes/Nodes.h
+++ b/arm_compute/graph/nodes/Nodes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_NODES_H
-#define ARM_COMPUTE_GRAPH_NODES_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
+#define ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
 
 #include "arm_compute/graph/nodes/ActivationLayerNode.h"
 #include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h"
@@ -43,8 +43,6 @@
 #include "arm_compute/graph/nodes/FlattenLayerNode.h"
 #include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
 #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
 #include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h"
 #include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h"
 #include "arm_compute/graph/nodes/InputNode.h"
@@ -70,4 +68,4 @@
 #include "arm_compute/graph/nodes/StackLayerNode.h"
 #include "arm_compute/graph/nodes/StridedSliceLayerNode.h"
 
-#endif /* ARM_COMPUTE_GRAPH_NODES_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h
index f1576d6..580f339 100644
--- a/arm_compute/graph/nodes/NodesFwd.h
+++ b/arm_compute/graph/nodes/NodesFwd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_NODES_FWD_H
-#define ARM_COMPUTE_GRAPH_NODES_FWD_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
+#define ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
 
 namespace arm_compute
 {
@@ -49,9 +49,7 @@
 class FlattenLayerNode;
 class FullyConnectedLayerNode;
 class FusedConvolutionBatchNormalizationNode;
-class FusedConvolutionWithPostOpNode;
 class FusedDepthwiseConvolutionBatchNormalizationNode;
-class FusedConvolutionBatchNormalizationWithPostOpsNode;
 class GenerateProposalsLayerNode;
 class InputNode;
 class L2NormalizeLayerNode;
@@ -77,4 +75,4 @@
 class StridedSliceLayerNode;
 } // namespace graph
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_NODES_FWD_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
diff --git a/arm_compute/graph/printers/DotGraphPrinter.h b/arm_compute/graph/printers/DotGraphPrinter.h
index 63b8927..564aecf 100644
--- a/arm_compute/graph/printers/DotGraphPrinter.h
+++ b/arm_compute/graph/printers/DotGraphPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019,2021 Arm Limited.
+ * Copyright (c) 2018-2019,2021,2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H
-#define ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
+#define ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
 
 #include "arm_compute/graph/IGraphPrinter.h"
 
@@ -57,8 +57,6 @@
     void visit(DepthwiseConvolutionLayerNode &n) override;
     void visit(EltwiseLayerNode &n) override;
     void visit(FusedConvolutionBatchNormalizationNode &n) override;
-    void visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n) override;
-    void visit(FusedConvolutionWithPostOpNode &n) override;
     void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
     void visit(NormalizationLayerNode &n) override;
     void visit(PoolingLayerNode &n) override;
@@ -106,4 +104,4 @@
 };
 } // namespace graph
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 8c9e45d..77bf48d 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -21,12 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CLCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_CLCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -120,11 +119,9 @@
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
      *                              available which may introduce a drop of accuracy as well. Default is false
      * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     * @param[in]  post_ops         (Optional) A sequence of post operations that are performed after the main operation.
      */
     void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1,
-                   const experimental::PostOpList<ICLTensor *> &post_ops = experimental::PostOpList<ICLTensor *> {});
+                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context  The compile context to be used.
@@ -144,11 +141,10 @@
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
      *                              available which may introduce a drop of accuracy as well. Default is false
      * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     * @param[in]  post_ops         (Optional) A sequence of post operations that are performed after the main operation.
      */
     void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
                    const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
-                   unsigned int num_groups = 1, const experimental::PostOpList<ICLTensor *> &post_ops = experimental::PostOpList<ICLTensor *> {});
+                   unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer
      *
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -167,13 +163,12 @@
      * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
      *                             available which may introduce a drop of accuracy as well. Default is false
      * @param[in] num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     * @param[in] post_ops         (Optional) A sequence of post operations that are performed after the main operation.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                            const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
-                           unsigned int num_groups = 1, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+                           unsigned int num_groups = 1);
     /** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer
      *
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -203,5 +198,5 @@
     struct Impl;
     std::unique_ptr<Impl> _impl;
 };
-}
-#endif /* ARM_COMPUTE_CLCONVOLUTIONLAYER_H */
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
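After this change CLConvolutionLayer::configure() no longer accepts a PostOpList; a fused activation is passed through act_info alone. A brief usage sketch of the updated signature (tensor setup and parameter values are placeholders):

    // Hypothetical tensors; shapes, data types and allocation omitted for brevity.
    CLTensor src, weights, biases, dst;

    CLConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 0, 0),  // conv_info: stride 1, no padding
                   WeightsInfo(),
                   Size2D(1U, 1U),             // dilation
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                   false,                      // enable_fast_math
                   1);                         // num_groups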
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 9827340..4bafef2 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -21,10 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
 
-#include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTypes.h"
@@ -95,11 +94,9 @@
      * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
      * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     * @param[in]  post_ops     (Optional) A sequence of post operations that are performed after the main operation.
      */
     void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1,
-                   const experimental::PostOpList<ICLTensor *> &post_ops = experimental::PostOpList<ICLTensor *> {});
+                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -118,12 +115,10 @@
      * @param[in]  dilation        (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     * @param[in]  post_ops        (Optional) A sequence of post operations that are performed after the main operation.
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
                    const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1,
-                   const experimental::PostOpList<ICLTensor *> &post_ops = experimental::PostOpList<ICLTensor *> {});
+                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer.
      *
      * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -141,13 +136,11 @@
      * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
      * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     * @param[in]  post_ops     (Optional) A sequence of post operations that are performed after the main operation.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1,
-                           const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
 
     // Inherited methods overridden:
     void run() override;
@@ -158,4 +151,4 @@
     std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
index 630f533..05f172b 100644
--- a/arm_compute/runtime/FunctionDescriptors.h
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
-#define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
+#define ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
@@ -58,24 +58,22 @@
 {
     Conv2dInfo() = default;
 
-    Conv2dInfo(const PadStrideInfo                           &conv_info,
-               const Size2D                                  &dilation,
-               const ActivationLayerInfo                     &act_info,
-               bool                                           enable_fast_math,
-               unsigned int                                   num_groups,
-               const experimental::PostOpList<ITensorInfo *> &post_ops     = experimental::PostOpList<ITensorInfo *> {},
-               const WeightsInfo                             &weights_info = WeightsInfo())
-        : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups), post_ops(post_ops), weights_info(weights_info)
+    Conv2dInfo(const PadStrideInfo       &conv_info,
+               const Size2D              &dilation,
+               const ActivationLayerInfo &act_info,
+               bool                       enable_fast_math,
+               unsigned int               num_groups,
+               const WeightsInfo         &weights_info = WeightsInfo())
+        : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups), weights_info(weights_info)
     {
     }
 
-    PadStrideInfo                           conv_info{};
-    Size2D                                  dilation{ 1U, 1U };
-    ActivationLayerInfo                     act_info{};
-    bool                                    enable_fast_math{ false };
-    unsigned int                            num_groups{ 1 };
-    experimental::PostOpList<ITensorInfo *> post_ops{};
-    WeightsInfo                             weights_info{};
+    PadStrideInfo       conv_info{};
+    Size2D              dilation{ 1U, 1U };
+    ActivationLayerInfo act_info{};
+    bool                enable_fast_math{ false };
+    unsigned int        num_groups{ 1 };
+    WeightsInfo         weights_info{};
 };
 
 /** Descriptor used by the 3d Convolution function */
@@ -102,4 +100,4 @@
 };
 
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
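The trimmed Conv2dInfo above drops the post_ops member, so a descriptor is now built from the convolution attributes and an optional fused activation only. An illustrative construction with arbitrary values:

    const Conv2dInfo info(PadStrideInfo(1, 1, 1, 1),  // stride 1, padding 1
                          Size2D(1U, 1U),             // dilation
                          ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
                          false,                      // enable_fast_math
                          1);                         // num_groups; weights_info defaults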
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 04ee10b..0142497 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -48,6 +48,7 @@
    - Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface:
      - @ref experimental::dynamic_fusion::GpuCkwResize
  - Update OpenCL™ API headers to v2023.04.17.
+ - Remove legacy PostOps interface. PostOps was the experimental interface for kernel fusion and has been replaced by the new Dynamic Fusion interface.
  - Performance optimizations:
    - Optimize @ref cpu::CpuReshape
  - Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface with support for FP16/FP32 only:
diff --git a/scripts/format_code.py b/scripts/format_code.py
index fa572cf..94c49fd 100755
--- a/scripts/format_code.py
+++ b/scripts/format_code.py
@@ -60,7 +60,9 @@
     "/convolution/",
     "/arm_gemm/",
     "/arm_conv/",
-    "compute_kernel_writer/"
+    "compute_kernel_writer/",
+    "SConscript",
+    "SConstruct"
 ]
 
 def adjust_copyright_year(copyright_years, curr_year):
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index f508b7e..a02739f 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -72,8 +72,6 @@
 	"graph/nodes/FlattenLayerNode.cpp",
 	"graph/nodes/FullyConnectedLayer.cpp",
 	"graph/nodes/FusedConvolutionBatchNormalizationNode.cpp",
-	"graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp",
-	"graph/nodes/FusedConvolutionWithPostOpNode.cpp",
 	"graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp",
 	"graph/nodes/GenerateProposalsLayerNode.cpp",
 	"graph/nodes/InputNode.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7640923..39fba86 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -73,8 +73,6 @@
 	graph/nodes/FlattenLayerNode.cpp
 	graph/nodes/FullyConnectedLayer.cpp
 	graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
-	graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
-	graph/nodes/FusedConvolutionWithPostOpNode.cpp
 	graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
 	graph/nodes/GenerateProposalsLayerNode.cpp
 	graph/nodes/InputNode.cpp
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 03f7869..7e56a3b 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -23,16 +23,14 @@
  */
 #include "src/core/CL/CLUtils.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "support/StringSupport.h"
 
-#include "src/core/experimental/PostOpUtils.h"
-
 namespace arm_compute
 {
 cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType image_type)
@@ -40,7 +38,7 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
 
     const cl::Context &ctx    = CLKernelLibrary::get().context();
-    const cl::Buffer  &buffer = tensor->cl_buffer();
+    const cl::Buffer &buffer = tensor->cl_buffer();
     const ITensorInfo *info   = tensor->info();
     ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(),
                              "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement");
@@ -113,112 +111,4 @@
 
     return cl::Image2D(cl_image);
 }
-
-namespace experimental
-{
-PostOpCLKernelUtils::PostOpCLKernelUtils(const Config &supported_config)
-    : _supported_config(supported_config)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(supported_config.empty(), "Empty PostOp CL kernel support configuration is not allowed");
-    for(auto it = _supported_config.begin(); it != _supported_config.end(); ++it)
-    {
-        auto post_op_sequence = it->first;
-        auto post_op_slots    = std::get<1>(it->second);
-        ARM_COMPUTE_ERROR_ON_MSG(post_op_sequence.size() != post_op_slots.size(), "The number of PostOps must be the same as that of the assigned slots");
-    }
-}
-
-bool PostOpCLKernelUtils::are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops)
-{
-    for(const auto &op : post_ops.get_list())
-    {
-        for(const auto &tensor : op->arguments())
-        {
-            const TensorShape &out_shape = TensorShape::broadcast_shape(dst->tensor_shape(), (*tensor)->tensor_shape());
-            // All post ops must be elementwise and must not alter the shape of the original dst tensor after broadcasting
-            if(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0))
-            {
-                return false;
-            }
-            // NOTE: Kernel limitation: currently only the following broadcasting types are supported:
-            //  1. Post op arg is scalar, broadcast in both first and second dims
-            //  2. Post op arg is of shape: second dim=1, first dim=N, broadcast only in second dim
-            //  This means this case: Post op arg is of shape: second dim=M, first dim=1, broadcast only in first dim, is NOT supported
-            if(dst->dimension(0) > 1 && dst->dimension(1) > 1 && (*tensor)->dimension(0) == 1 && (*tensor)->dimension(1) > 1)
-            {
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-bool PostOpCLKernelUtils::is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const
-{
-    if(post_ops.size() == 0)
-    {
-        return true; // Always support cases where no post op is specified
-    }
-    const auto post_op_sequence = get_post_op_sequence(post_ops);
-
-    return _supported_config.find(post_op_sequence) != _supported_config.end();
-}
-
-void PostOpCLKernelUtils::set_post_ops_cl_build_options(CLBuildOptions &build_opts, const PostOpList<ITensorInfo *> &post_ops) const
-{
-    const auto post_op_sequence = get_post_op_sequence(post_ops);
-    const auto slots            = std::get<1>(_supported_config.at(post_op_sequence));
-    for(size_t post_op_id = 0; post_op_id < post_ops.size(); ++post_op_id)
-    {
-        const auto &post_op     = post_ops.get_list().at(post_op_id);
-        const auto  slot_prefix = "-DP" + support::cpp11::to_string(slots[post_op_id]);
-        if(post_op->type() == experimental::PostOpType::Activation)
-        {
-            const auto _post_op  = utils::cast::polymorphic_downcast<const experimental::PostOpAct<ITensorInfo *> *>(post_op.get());
-            const auto act_type  = slot_prefix + "_ACTIVATION_TYPE=" + lower_string(string_from_activation_func(_post_op->_act_info.activation()));
-            const auto act_a_val = slot_prefix + "_ACTIVATION_A_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.a());
-            const auto act_b_val = slot_prefix + "_ACTIVATION_B_VAL=" + float_to_string_with_full_precision(_post_op->_act_info.b());
-            build_opts.add_option(act_type);
-            build_opts.add_option(act_a_val);
-            build_opts.add_option(act_b_val);
-        }
-        else if(post_op->type() == experimental::PostOpType::Eltwise_Add)
-        {
-            size_t     arg_id     = 1;
-            const auto eltwise_op = slot_prefix + "_ELTWISE_OP=ADD" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos());
-            build_opts.add_option(eltwise_op);
-            for(const auto &tensor : post_op->arguments())
-            {
-                const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
-                const auto width  = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
-                build_opts.add_option(height);
-                build_opts.add_option(width);
-                ++arg_id;
-            }
-        }
-        else if(post_op->type() == experimental::PostOpType::Eltwise_PRelu)
-        {
-            size_t     arg_id     = 1;
-            const auto eltwise_op = slot_prefix + "_ELTWISE_OP=PRELU" + "_X_POS_" + support::cpp11::to_string(post_op->prev_dst_pos());
-            build_opts.add_option(eltwise_op);
-            for(const auto &tensor : post_op->arguments())
-            {
-                const auto height = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_HEIGHT=" + support::cpp11::to_string((*tensor)->dimension(1));
-                const auto width  = slot_prefix + "_ELTWISE_ARG" + support::cpp11::to_string(arg_id) + "_WIDTH=" + support::cpp11::to_string((*tensor)->dimension(0));
-                build_opts.add_option(height);
-                build_opts.add_option(width);
-                ++arg_id;
-            }
-        }
-    }
-}
-
-void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const
-{
-    const auto post_op_sequence = get_post_op_sequence(post_ops);
-    const auto postfix          = std::get<0>(_supported_config.at(post_op_sequence));
-    kernel_name += postfix;
-}
-} // namespace experimental
-
 } // namespace arm_compute
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index e3f12d4..f0e79bc 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -22,11 +22,10 @@
  * SOFTWARE.
  */
 
-#ifndef ARM_COMPUTE_CL_CLUTILS_H
-#define ARM_COMPUTE_CL_CLUTILS_H
+#ifndef ACL_SRC_CORE_CL_CLUTILS_H
+#define ACL_SRC_CORE_CL_CLUTILS_H
 
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/experimental/IPostOp.h"
 
 #include <map>
 
@@ -74,88 +73,6 @@
  * @return cl::Image2D object
  */
 cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
+} // namespace arm_compute
 
-namespace experimental
-{
-/** @name (EXPERIMENTAL_POST_OPS)
- * @{
- */
-
-/** Manage validation, building and configurations of PostOp CL kernels */
-class PostOpCLKernelUtils final
-{
-public:
-    /** CL kernel name postfix for post ops */
-    using NamePostfix = std::string;
-    /** CL kernels that supports post ops assign each post op to a 'slot', in accordance with the postfix
-     * For example, for a kernel with postfix '_act_prelu_eltwiseadd', there are 3 slots
-     * slot 1: (unary) activation, slot 2: pRelu, slot 3: elementwise addition
-     *
-     * Some kernels may allow some slots to be optional, to support multiple combinations of post op sequences.
-     * In such cases, we need to explicitly set up a mapping between each post op and the slots for that kernel.
-     * For example, suppose we have 2 kernels with postfixes: _eltwiseadd_prelu, _act_eltwiseadd_act_prelu, where the activations in the
-     * second kernel are optional. Say we want to support an eltwise addition, followed by a prelu (sequence { eltwiseadd, prelu }).
-     * Now we can choose which one of the 2 kernels to use, since they both support this post op sequence.
-     * We can either:
-     *  1. assign the elementwise to slot 1 and prelu to slot 2 of kernel 1
-     *  { { Eltwise_Add, PRelu } -> {"_eltwise_act", {1, 2} } } or
-     *  2. assign the elementwise to slot 2 and prelu to slot 4 of kernel 1
-     *  { { Eltwise_Add, PRelu } -> {"_act_eltwiseadd_act_prelu", {2, 4} } }
-     */
-    using Slots  = std::vector<unsigned int>;
-    using Config = std::map<PostOpTypeSequence, std::tuple<NamePostfix, Slots>>;
-
-public:
-    explicit PostOpCLKernelUtils(const Config &config);
-
-    /** Check if post op argument tensor shapes are compliant
-     * All post ops must not alter the shape of the original dst tensor (even after broadcasting)
-     *
-     * @param[in] dst      Dst tensor to apply the post ops to
-     * @param[in] post_ops Post ops
-     *
-     * @return true if shapes are compliant and false otherwise
-     */
-    static bool are_post_op_shapes_compliant(const ITensorInfo *dst, const experimental::PostOpList<ITensorInfo *> &post_ops);
-    /** Check if the post op sequence is supported in the current configuration
-     *
-     * @param[in] post_ops Post ops
-     *
-     * @return true if the post op sequence is supported and false otherwise
-     */
-    bool is_post_op_sequence_supported(const PostOpList<ITensorInfo *> &post_ops) const;
-    /** Helper function to set PostOp related build options
-     * @note Convention
-     *      1. Each post op "slot" is prefixed with "P<slot number>", followed by the usual parameters for that post op.
-     *      E.g. If the first slot is an activation, we need to pass 3 definitions in this way:
-     *          -P1_ACTIVATION_TYPE=...  -P1_ACTIVATION_A_VAL=...   -P1_ACTIVATION_B_VAL=...
-     *
-     *      2. For multi-ary post ops, to pass the position of the previous op's dest tensor,
-     *         we append "_X_POS_<pos>" to the post op type.
-     *      E.g. for a single post op add(dst, x), where dst is the result of the main op.
-     *         In this case, the position of the previous op's dest is 0, so we pass
-     *         -P1_ELTWISE_OP=ADD_X_POS_0
-     *
-     * @param[out] built_opts OpenCL kernel build options
-     * @param[in]  post_ops   Post ops
-     *
-     */
-    void set_post_ops_cl_build_options(CLBuildOptions &built_opts, const PostOpList<ITensorInfo *> &post_ops) const;
-    /** Helper function to set PostOp kernel name
-     *
-     * @param[out] kernel_name OpenCL kernel name
-     * @param[in]  post_ops    Post ops
-     *
-     */
-    void set_post_ops_cl_kernel_name(std::string &kernel_name, const PostOpList<ITensorInfo *> &post_ops) const;
-
-private:
-    Config _supported_config{};
-};
-/** @} */ // end of group (EXPERIMENTAL_POST_OPS)
-
-} // namespace experimental
-
-} // arm_compute
-
-#endif /* ARM_COMPUTE_CL_CLUTILS_H */
+#endif // ACL_SRC_CORE_CL_CLUTILS_H
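For context on the class deleted above: PostOpCLKernelUtils mapped a supported post-op sequence to a kernel-name postfix plus the "slot" each op occupied, as its removed doc comment describes. A sketch of such a configuration and its construction, following that comment (the postfix string and slot numbers are illustrative, not quoted from a specific kernel):

    using namespace arm_compute;

    // One supported sequence: optional activation -> eltwise add -> optional activation,
    // bound to slots 1, 2 and 3 of a kernel whose name carries the postfix below.
    const experimental::PostOpCLKernelUtils::Config supported_config
    {
        { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation },
          { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
    };

    experimental::PostOpCLKernelUtils post_op_utils(supported_config);
    // At configure time the utils validated the requested sequence, appended the
    // postfix to the kernel name and emitted the -DP<slot>_... build options
    // consumed by the gemm_fused_post_ops kernels that this change also removes.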
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
deleted file mode 100644
index 2c2d60e..0000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h"
-
-/** (EXPERIMENTAL_POST_OPS) Post Op expansions for the post op sequence:
- * act (optional): POST_OP1_ACTIVATION_OPTIONAL
- * eltwise_op   : POST_OP2_ELTWISE_OP
- * act (optional): POST_OP3_ACTIVATION_OPTIONAL
- */
-
-/** Post Op 1: Activation Block (Optional)
- * @name POST_OP1_ACTIVATION_OPTIONAL
- * Toggled by -DP1_ACTIVATION_TYPE
- * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
- * @{
- */
-#if defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
-    MIXED_PRECISION_ACTIVATION_BLOCK(N, P1_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
-#else                                                                                         // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-#define POST_OP1_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop
-#endif                                                                                        // defined(P1_ACTIVATION_TYPE) && defined(P1_ACTIVATION_A_VAL) && defined(P1_ACTIVATION_B_VAL)
-/** @} */                                                                                     // end of group POST_OP1_ACTIVATION_OPTIONAL
-
-/** Post Op 2: Eltwise Op Block
- * Handles both broadcasting and non-broadcasting cases
- * @name POST_OP2_ELTWISE_OP
- *
- * @param[in] P2_ELTWISE_ARG1_HEIGHT Height (number of rows) of the @ref ELTWISE_OPERAND_NAME tensor
- * @param[in] P2_ELTWISE_ARG1_WIDTH  Width (number of columns) of the @ref ELTWISE_OPERAND_NAME tensor
- * @param[in] OP                     The elementwise post op
- * @param[in] M0                     The number of consecutive rows
- * @param[in] N0                     The number of consecutive columns
- * @param[in] BASENAME               The basename of the result variables
- * @param[in] ELTWISE_OPERAND_NAME   The basename of the other operand variables
- * @param[in] ELTWISE_OPERAND_ROW    The starting row of the other operand variables. Required as different boundary handling strategies are used by different kernels
- *                                   E.g. reshaped_only_rhs and native kernels shifts rows (by using COMPUTE_M0_START_ROW) to handle boundary rows,
- *                                   whereas reshaped kernels do not shift rows
- * @param[in] DATA_TYPE              Data type of the result variables
- * @param[in] DATA_TYPE_ACCUMULATOR  Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] ZERO                   Zero vector for z offset
- * @param[in] PARTIAL_LOAD_M0        The partial size in y, for partial blocks. Supported: [0, @p M0)
- * @param[in] PARTIAL_LOAD_N0        The partial size in x, for partial blocks. Supported: [0, @p N0)
- * @param[in] PARTIAL_COND_Y         Condition on the y axis to perform the partial load Y. True to use PARTIAL_LOAD_M0 rather than M0.
- * @param[in] PARTIAL_COND_X         Condition on the x axis to perform the partial load X. True to use PARTIAL_LOAD_N0 rather than N0.
- * @{
- */
-#if defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#if P2_ELTWISE_ARG1_HEIGHT == 1
-#if P2_ELTWISE_ARG1_WIDTH == 1 // Case 1: Broadcasting in both X and Y; op2 arg tile shape[YxX] == [1x1]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z;              \
-    VEC_DATA_TYPE(DATA_TYPE, 1)                                                                                                                                                                        \
-    ELTWISE_OPERAND_NAME##0 = VLOAD(1)(0, (__global DATA_TYPE *)ELTWISE_OPERAND_NAME##_addr);                                                                                                          \
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, 1, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#else // P2_ELTWISE_ARG1_WIDTH == 1; Case 2: Broadcasting in only Y; op2 arg tile shape[YxX] == [1xN0]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                                        \
-    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, 1, PARTIAL_LOAD_N0, false, PARTIAL_COND_X);                                                      \
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#endif // P2_ELTWISE_ARG1_WIDTH == 1
-#else  // P2_ELTWISE_ARG1_HEIGHT == 1; Case 3: No broadcasting; op2 arg tile shape[YxX] == [M0xN0]
-#define POST_OP2_ELTWISE_OP(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_ROW, DATA_TYPE, DATA_TYPE_ACCUMULATOR, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                                                                                                  \
-    __global uchar *ELTWISE_OPERAND_NAME##_addr = ELTWISE_OPERAND_NAME##_ptr + ELTWISE_OPERAND_NAME##_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (ELTWISE_OPERAND_ROW * ELTWISE_OPERAND_NAME##_stride_y) + get_global_id(2) * ELTWISE_OPERAND_NAME##_stride_z; \
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, ELTWISE_OPERAND_NAME, ELTWISE_OPERAND_NAME##_addr, 0, ELTWISE_OPERAND_NAME##_stride_y, ZERO, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X);                                                                                        \
-    MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, BASENAME, ELTWISE_OPERAND_NAME, DATA_TYPE_ACCUMULATOR, ELTWISE_OPERAND_NAME##_hp);
-#endif    // P2_ELTWISE_ARG1_HEIGHT == 1
-#endif    // defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-/** @} */ // end of group POST_OP2_ELTWISE_OP
-/** Post Op 3: Activation Block (Optional)
- * @name POST_OP3_ACTIVATION_OPTIONAL
- * Toggled by -DP3_ACTIVATION_TYPE
- * params: same as those in @ref MIXED_PRECISION_ACTIVATION_BLOCK
- * @{
- */
-#if defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) \
-    MIXED_PRECISION_ACTIVATION_BLOCK(N, P3_ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, DATA_TYPE_ACCUMULATOR);
-#else                                                                                         // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-#define POST_OP3_ACTIVATION_OPTIONAL(N, DATA_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME) // noop
-#endif                                                                                        // defined(P3_ACTIVATION_TYPE) && defined(P3_ACTIVATION_A_VAL) && defined(P3_ACTIVATION_B_VAL)
-/** @} */                                                                                     // end of group POST_OP3_ACTIVATION_OPTIONAL
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
deleted file mode 100644
index 22ae098..0000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h"
-#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h"
-#include "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h"
-
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_native kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#define VFMA(a, b, c)     \
-    ({                    \
-        c = fma(a, b, c); \
-    })
-
-#if M0 == 1
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-    })
-#elif M0 == 2 // M0 == 2
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-    })
-#elif M0 == 3 // M0 == 3
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-    })
-#elif M0 == 4 // M0 == 4
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-    })
-#elif M0 == 5 // M0 == 5
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-    })
-#elif M0 == 6 // M0 == 6
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-    })
-#elif M0 == 7 // M0 == 7
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
-    })
-#elif M0 == 8 // M0 == 8
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
-    })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-
-#if defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_native, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_native_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                     IMAGE_DECLARATION(rhs),
-#if defined(BETA)
-                                                     IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                     IMAGE_DECLARATION(dst),
-                                                     // Post Op arguments
-                                                     IMAGE_DECLARATION(eltwise_operand),
-                                                     uint lhs_stride_z,
-                                                     uint rhs_stride_z,
-#if defined(BETA)
-                                                     uint bias_stride_z,
-#endif //defined(BETA)
-                                                     uint      dst_stride_z,
-                                                     uint      eltwise_operand_stride_z,
-                                                     const int M,
-                                                     const int N,
-                                                     const int K
-#if defined(REINTERPRET_INPUT_AS_3D)
-                                                     ,
-                                                     uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                     ,
-                                                     uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                    )
-{
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
-    // RHS offset and step X
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-    uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((x * N0 >= N) || (y * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-    // Compute RHS matrix address
-    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else  // defined(MATRIX_B_DEPTH)
-    rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply lhs_stride_z by DEPTH_GEMM3D
-    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
-
-    int i = 0;
-#if K0 > 1
-    for(; i <= (K - K0); i += K0)
-    {
-        // Supported cases (M0, K0):
-        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
-        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
-        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
-        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
-        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
-        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
-        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
-        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
-        // Load values from RHS matrix
-        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
-
-        RHS_VFMA_M0xN0(0, a, b0, c);
-        RHS_VFMA_M0xN0(1, a, b1, c);
-#if K0 > 2
-        RHS_VFMA_M0xN0(2, a, b2, c);
-#endif // K0 > 2
-#if K0 > 3
-        RHS_VFMA_M0xN0(3, a, b3, c);
-#endif // K0 > 3
-#if K0 > 4
-        RHS_VFMA_M0xN0(4, a, b4, c);
-        RHS_VFMA_M0xN0(5, a, b5, c);
-        RHS_VFMA_M0xN0(6, a, b6, c);
-        RHS_VFMA_M0xN0(7, a, b7, c);
-#endif // K0 > 4
-#if K0 > 8
-        RHS_VFMA_M0xN0(8, a, b8, c);
-        RHS_VFMA_M0xN0(9, a, b9, c);
-        RHS_VFMA_M0xN0(A, a, bA, c);
-        RHS_VFMA_M0xN0(B, a, bB, c);
-        RHS_VFMA_M0xN0(C, a, bC, c);
-        RHS_VFMA_M0xN0(D, a, bD, c);
-        RHS_VFMA_M0xN0(E, a, bE, c);
-        RHS_VFMA_M0xN0(F, a, bF, c);
-#endif // K0 > 8
-
-        lhs_offset += K0 * sizeof(DATA_TYPE);
-        rhs_offset += K0 * rhs_stride_y;
-    }
-#endif // K0 > 1
-    // Left-over accumulations
-    for(; i < K; ++i)
-    {
-        // Load values from LHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
-#if M0 > 1
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
-#endif // M0 > 1
-#if M0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
-#endif // M0 > 2
-#if M0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
-#endif // M0 > 3
-#if M0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
-#endif // M0 > 4
-#if M0 > 5
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
-#endif // M0 > 5
-#if M0 > 6
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
-#endif // M0 > 6
-#if M0 > 7
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
-#endif // M0 > 7
-
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
-        RHS_VFMA_M0xN0(0, a, b, c);
-
-        lhs_offset += sizeof(DATA_TYPE);
-        rhs_offset += rhs_stride_y;
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    const bool cond_y = y == 0;
-    const bool cond_x = ((x + 1) * N0 >= N);
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(GEMM_MM_NATIVE_POST_ACT_ELTWISE_OP_ACT)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
deleted file mode 100644
index 89577e9..0000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
+++ /dev/null
@@ -1,1424 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "fp_post_ops_act_eltwise_op_act.h"
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped kernel */
-
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#if defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-    })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-        c += a.s2 * b.s2;   \
-    })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-        c += a.s2 * b.s2;   \
-        c += a.s3 * b.s3;   \
-    })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-        c += a.s2 * b.s2;   \
-        c += a.s3 * b.s3;   \
-        c += a.s4 * b.s4;   \
-        c += a.s5 * b.s5;   \
-        c += a.s6 * b.s6;   \
-        c += a.s7 * b.s7;   \
-    })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c) \
-    ({                      \
-        c += a.s0 * b.s0;   \
-        c += a.s1 * b.s1;   \
-        c += a.s2 * b.s2;   \
-        c += a.s3 * b.s3;   \
-        c += a.s4 * b.s4;   \
-        c += a.s5 * b.s5;   \
-        c += a.s6 * b.s6;   \
-        c += a.s7 * b.s7;   \
-        c += a.s8 * b.s8;   \
-        c += a.s9 * b.s9;   \
-        c += a.sA * b.sA;   \
-        c += a.sB * b.sB;   \
-        c += a.sC * b.sC;   \
-        c += a.sD * b.sD;   \
-        c += a.sE * b.sE;   \
-        c += a.sF * b.sF;   \
-    })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#else  // defined(MIXED_PRECISION)
-#if K0 == 2
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-    })
-#elif K0 == 3 // K0 == 3
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-        c = fma(a.s2, b.s2, c); \
-    })
-#elif K0 == 4 // K0 == 4
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-        c = fma(a.s2, b.s2, c); \
-        c = fma(a.s3, b.s3, c); \
-    })
-#elif K0 == 8 // K0 == 8
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-        c = fma(a.s2, b.s2, c); \
-        c = fma(a.s3, b.s3, c); \
-        c = fma(a.s4, b.s4, c); \
-        c = fma(a.s5, b.s5, c); \
-        c = fma(a.s6, b.s6, c); \
-        c = fma(a.s7, b.s7, c); \
-    })
-#elif K0 == 16 // K0 == 16
-#define ARM_DOT_K0(a, b, c)     \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-        c = fma(a.s2, b.s2, c); \
-        c = fma(a.s3, b.s3, c); \
-        c = fma(a.s4, b.s4, c); \
-        c = fma(a.s5, b.s5, c); \
-        c = fma(a.s6, b.s6, c); \
-        c = fma(a.s7, b.s7, c); \
-        c = fma(a.s8, b.s8, c); \
-        c = fma(a.s9, b.s9, c); \
-        c = fma(a.sA, b.sA, c); \
-        c = fma(a.sB, b.sB, c); \
-        c = fma(a.sC, b.sC, c); \
-        c = fma(a.sD, b.sD, c); \
-        c = fma(a.sE, b.sE, c); \
-        c = fma(a.sF, b.sF, c); \
-    })
-#else // K0 not supported
-#error "K0 value not supported"
-#endif // K0 conditions
-#endif // defined(MIXED_PRECISION)
-
-#if defined(ARM_DOT_K0XN0)
-#undef ARM_DOT_K0XN0
-#endif // defined(ARM_DOT_K0XN0)
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-    })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-        ARM_DOT_K0((a), (b##2), (c.s2)); \
-    })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-        ARM_DOT_K0((a), (b##2), (c.s2)); \
-        ARM_DOT_K0((a), (b##3), (c.s3)); \
-    })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-        ARM_DOT_K0((a), (b##2), (c.s2)); \
-        ARM_DOT_K0((a), (b##3), (c.s3)); \
-        ARM_DOT_K0((a), (b##4), (c.s4)); \
-        ARM_DOT_K0((a), (b##5), (c.s5)); \
-        ARM_DOT_K0((a), (b##6), (c.s6)); \
-        ARM_DOT_K0((a), (b##7), (c.s7)); \
-    })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(a, b, c)           \
-    ({                                   \
-        ARM_DOT_K0((a), (b##0), (c.s0)); \
-        ARM_DOT_K0((a), (b##1), (c.s1)); \
-        ARM_DOT_K0((a), (b##2), (c.s2)); \
-        ARM_DOT_K0((a), (b##3), (c.s3)); \
-        ARM_DOT_K0((a), (b##4), (c.s4)); \
-        ARM_DOT_K0((a), (b##5), (c.s5)); \
-        ARM_DOT_K0((a), (b##6), (c.s6)); \
-        ARM_DOT_K0((a), (b##7), (c.s7)); \
-        ARM_DOT_K0((a), (b##8), (c.s8)); \
-        ARM_DOT_K0((a), (b##9), (c.s9)); \
-        ARM_DOT_K0((a), (b##A), (c.sA)); \
-        ARM_DOT_K0((a), (b##B), (c.sB)); \
-        ARM_DOT_K0((a), (b##C), (c.sC)); \
-        ARM_DOT_K0((a), (b##D), (c.sD)); \
-        ARM_DOT_K0((a), (b##E), (c.sE)); \
-        ARM_DOT_K0((a), (b##F), (c.sF)); \
-    })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
-
-#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                    IMAGE_DECLARATION(rhs),
-#if defined(BETA)
-                                                                    IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                    IMAGE_DECLARATION(dst),
-                                                                    // Post Op arguments
-                                                                    IMAGE_DECLARATION(eltwise_operand),
-                                                                    uint lhs_stride_z,
-                                                                    uint rhs_stride_z,
-#if defined(BETA)
-                                                                    uint bias_stride_z,
-#endif //defined(BETA)
-                                                                    uint dst_stride_z,
-                                                                    uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                    ,
-                                                                    uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                    ,
-                                                                    const int M,
-                                                                    const int N,
-                                                                    const int K)
-{
-    // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
-                               (get_global_id(2) * lhs_stride_z);
-
-    // Compute RHS matrix address
-    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
-#else  // defined(MATRIX_B_DEPTH)
-    rhs_addr += get_global_id(2) * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-    for(int i = 0; i < K; i += K0)
-    {
-        // Supported cases (M0, K0):
-        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
-        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
-        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
-        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
-        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
-        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
-        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
-        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
-        // Load values from RHS matrix
-        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
-        // Accumulate
-        ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
-        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
-        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-    // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
-    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
-                                    2) * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_nt_rhs_t_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                            __read_only image2d_t rhs_img,
-#if defined(BETA)
-                                                                            IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                            IMAGE_DECLARATION(dst),
-                                                                            // Post Op arguments
-                                                                            IMAGE_DECLARATION(eltwise_operand),
-                                                                            uint lhs_stride_z,
-                                                                            uint rhs_stride_z,
-#if defined(BETA)
-                                                                            uint bias_stride_z,
-#endif //defined(BETA)
-                                                                            uint dst_stride_z,
-                                                                            uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                            ,
-                                                                            uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                            ,
-                                                                            const int M,
-                                                                            const int N,
-                                                                            const int K)
-{
-    // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
-    // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (K0)
-#define LHS_STEP_X ((K0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (K0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
-    // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X PIXEL_UNIT
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
-                               (get_global_id(2) * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else  // defined(MATRIX_B_DEPTH)
-    const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
-    // Compute RHS matrix coordinates
-    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
-    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-    for(int i = 0; i < K; i += K0)
-    {
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
-
-        // Load values from RHS matrix stored in a cl_image
-        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
-        // Accumulate
-        ARM_DOT_K0XN0(a0, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(a7, b, c7);
-#endif // M0 > 7
-
-        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
-
-        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-    // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
-    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += get_global_id(2) * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
-                                    2) * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(LHS_TRANSPOSE)
-
-#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
-
-#if defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#else // defined(MIXED_PRECISION)
-
-#if(GPU_ARCH == GPU_ARCH_MIDGARD)
-#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
-#else // GPU_ARCH == GPU_ARCH_MIDGARD
-#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
-#endif // GPU_ARCH == GPU_ARCH_MIDGARD
-
-#endif // defined(MIXED_PRECISION)
-
-#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
-    ({                                                 \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
-    })
-#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
-    ({                                                    \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
-    })
-#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
-    ({                                                    \
-        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
-    })
-#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
-    ({                                                    \
-        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
-    })
-#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
-    ({                                                    \
-        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
-        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
-    })
-
-// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
-// a is the column-vector (transposed)
-// b is the row-vector (not transposed)
-// C is the output matrix
-// Lower case is a vector (a, b)
-// Upper case is a matrix (C)
-#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
-
-#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
-    ({                                                         \
-        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
-    })
-#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
-    ({                                                         \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
-        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
-    })
-#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
-    ({                                                         \
-        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
-        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
-    })
-#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
-    ({                                                         \
-        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
-        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
-    })
-#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
-    ({                                                         \
-        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
-        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
-        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
-        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
-        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
-    })
-#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C)           \
-    ({                                                        \
-        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C);           \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
-        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
-    })
-
-// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
-// The dimensions for this matrix multiplications are defined through M0, N0 and K0
-// The dimensions supported are:
-// M0: 1, 2, 3, 4, 8
-// N0: 1, 2, 3, 4, 8, 16
-// K0: 1, 2, 3, 4, 8, 16
-// This macro calls the vector-by-matrix macro K0 times
-// A, B and C are matrices
-#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
-    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
-    (M0, N0, TYPE, A, B, C)
-
-#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M                        Number of rows in LHS matrix not reshaped.
- * @param[in] N                        Number of columns in RHS matrix not reshaped.
- * @param[in] K                        Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                    IMAGE_DECLARATION(rhs),
-#if defined(BETA)
-                                                                    IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                    IMAGE_DECLARATION(dst),
-                                                                    // Post Op arguments
-                                                                    IMAGE_DECLARATION(eltwise_operand),
-                                                                    uint lhs_stride_z,
-                                                                    uint rhs_stride_z,
-#if defined(BETA)
-                                                                    uint bias_stride_z,
-#endif //defined(BETA)
-                                                                    uint dst_stride_z,
-                                                                    uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                    ,
-                                                                    uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                    ,
-                                                                    const int M,
-                                                                    const int N,
-                                                                    const int K)
-{
-    // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (M0)
-#define LHS_STEP_X ((M0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (M0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (N0)
-#define RHS_STEP_X ((N0) * (H0))
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (N0)
-#endif // defined(RHS_INTERLEAVE)
-
-    const uint x = get_global_id(0);
-    const uint y = get_global_id(1);
-    const uint z = get_global_id(2);
-
-    // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
-    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((x * N0 >= N) || (y * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
-
-    // Compute RHS matrix address
-    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
-    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else  // defined(MATRIX_B_DEPTH)
-    rhs_addr += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
-    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
-
-    for(int i = 0; i < K; i += K0)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, M0)
-        a0;
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b0;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-#if K0 > 1
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-#endif // K0 > 1
-
-#if K0 > 2
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-#endif // K0 > 2
-
-#if K0 > 3
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-#endif // K0 > 3
-
-#if K0 > 4
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-#endif // K0 > 4
-
-#if K0 > 8
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = VLOAD(N0)(0, rhs);
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-        rhs += RHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
-        lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
-#ifndef RHS_INTERLEAVE
-        rhs += (N0 * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
-                                    2) * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_lhs_t_rhs_nt_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                            __read_only image2d_t rhs_img,
-#if defined(BETA)
-                                                                            IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                            IMAGE_DECLARATION(dst),
-                                                                            // Post Op arguments
-                                                                            IMAGE_DECLARATION(eltwise_operand),
-                                                                            uint lhs_stride_z,
-                                                                            uint rhs_stride_z,
-#if defined(BETA)
-                                                                            uint bias_stride_z,
-#endif //defined(BETA)
-                                                                            uint dst_stride_z,
-                                                                            uint eltwise_operand_stride_z
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                            ,
-                                                                            uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                            ,
-                                                                            const int M,
-                                                                            const int N,
-                                                                            const int K)
-{
-    // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
-
-    // Block size
-#define LHS_BLOCK_SIZE ((K0) * (M0))
-
-#if defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (M0)
-#define LHS_STEP_X ((M0) * (V0))
-#define LHS_STEP_LOOP (1)
-#else // defined(LHS_INTERLEAVE)
-#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
-#define LHS_STEP_X (M0)
-#define LHS_STEP_LOOP (V0)
-#endif // defined(LHS_INTERLEAVE)
-
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#endif // defined(RHS_INTERLEAVE)
-
-    const uint x = get_global_id(0);
-    const uint y = get_global_id(1);
-    const uint z = get_global_id(2);
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((x * N0 >= N) || (y * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
-    const uint z_rhs = (z % MATRIX_B_DEPTH);
-#else  // defined(MATRIX_B_DEPTH)
-    const uint z_rhs = z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    // Compute RHS matrix coordinates
-    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
-    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
-
-    for(int i = 0; i < K; i += K0)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, M0)
-        a0;
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b0;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-#if K0 > 1
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-#endif // K0 > 1
-
-#if K0 > 2
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-#endif // K0 > 2
-
-#if K0 > 3
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-#endif // K0 > 3
-
-#if K0 > 4
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-#endif // K0 > 4
-
-#if K0 > 8
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-
-        a0 = VLOAD(M0)(0, lhs);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
-        lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
-        x_rhs += K0 * RHS_STEP_X;
-#ifndef RHS_INTERLEAVE
-        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-    // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
-    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD, M0, N0, c, bias, DATA_TYPE_ACCUMULATOR, bias_hp);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, get_global_id(1) * (uint)M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x, c_lp);
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-#undef LHS_STEP_LOOP
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-
-#endif // defined(LHS_TRANSPOSE)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
deleted file mode 100644
index 09ddcde..0000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
+++ /dev/null
@@ -1,1399 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "fp_post_ops_act_eltwise_op_act.h"
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
-#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-
-#define CONCAT(a, b) a##b
-
-#define ARM_DOT1(a, b, c) \
-    ({                    \
-        c = fma(a, b, c); \
-    })
-#define ARM_DOT2(a, b, c)       \
-    ({                          \
-        c = fma(a.s0, b.s0, c); \
-        c = fma(a.s1, b.s1, c); \
-    })
-#define ARM_DOT3(a, b, c)           \
-    ({                              \
-        ARM_DOT2(a, b, c);          \
-        c = fma((a.s2), (b.s2), c); \
-    })
-#define ARM_DOT4(a, b, c)           \
-    ({                              \
-        ARM_DOT3(a, b, c);          \
-        c = fma((a.s3), (b.s3), c); \
-    })
-#define ARM_DOT8(a, b, c)            \
-    ({                               \
-        ARM_DOT4((a.lo), (b.lo), c); \
-        ARM_DOT4((a.hi), (b.hi), c); \
-    })
-#define ARM_DOT16(a, b, c)           \
-    ({                               \
-        ARM_DOT8((a.lo), (b.lo), c); \
-        ARM_DOT8((a.hi), (b.hi), c); \
-    })
-
-#if N0 == 2
-#define ARM_DOT_K0XN0(k0, a, b, c) \
-    ({                             \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##0), (c.s0));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##1), (c.s1));     \
-    })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(k0, a, b, c) \
-    ({                             \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##0), (c.s0));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##1), (c.s1));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##2), (c.s2));     \
-    })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(k0, a, b, c) \
-    ({                             \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##0), (c.s0));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##1), (c.s1));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##2), (c.s2));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##3), (c.s3));     \
-    })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(k0, a, b, c) \
-    ({                             \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##0), (c.s0));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##1), (c.s1));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##2), (c.s2));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##3), (c.s3));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##4), (c.s4));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##5), (c.s5));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##6), (c.s6));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##7), (c.s7));     \
-    })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(k0, a, b, c) \
-    ({                             \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##0), (c.s0));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##1), (c.s1));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##2), (c.s2));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##3), (c.s3));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##4), (c.s4));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##5), (c.s5));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##6), (c.s6));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##7), (c.s7));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##8), (c.s8));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##9), (c.s9));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##A), (c.sA));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##B), (c.sB));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##C), (c.sC));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##D), (c.sD));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##E), (c.sE));     \
-        CONCAT(ARM_DOT, k0)        \
-        ((a), (b##F), (c.sF));     \
-    })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
-
-#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- */
-__kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                  IMAGE_DECLARATION(rhs),
-#if defined(BETA)
-                                                                  IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                  IMAGE_DECLARATION(dst),
-                                                                  // Post Op arguments
-                                                                  IMAGE_DECLARATION(eltwise_operand),
-                                                                  uint lhs_stride_z,
-                                                                  uint rhs_stride_z,
-#if defined(BETA)
-                                                                  uint bias_stride_z,
-#endif //defined(BETA)
-                                                                  uint dst_stride_z,
-                                                                  uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
-                                                                  ,
-                                                                  uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                  ,
-                                                                  uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                  ,
-                                                                  const int M,
-                                                                  const int N,
-                                                                  const int K)
-{
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (K0)
-#define RHS_STEP_X ((K0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (K0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-    uint z = get_global_id(2);
-
-    const bool cond_y = y == 0;
-    const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((x * N0 >= N) || (y * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-    // Compute RHS reshaped matrix address
-    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
-    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else  // defined(MATRIX_B_DEPTH)
-    rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-    // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply lhs_stride_z by DEPTH_GEMM3D
-    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
-
-    int i = 0;
-    for(; i <= (K - K0); i += K0)
-    {
-        // Supported cases (M0, K0):
-        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
-        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
-        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
-        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
-        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
-        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
-        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
-        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
-        // Load values from RHS reshaped matrix
-        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
-        // Accumulate
-        ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
-        lhs_offset += K0 * sizeof(DATA_TYPE);
-        rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
-    }
-
-    // Left-over accumulations
-    for(; i < K; ++i)
-    {
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
-        // Load values from RHS reshaped matrix
-        LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
-
-        // Accumulate
-        ARM_DOT_K0XN0(1, a0, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(1, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(1, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(1, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(1, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(1, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(1, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(1, a7, b, c7);
-#endif // M0 > 7
-
-        lhs_offset += sizeof(DATA_TYPE);
-        rhs_offset += sizeof(DATA_TYPE);
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M                        Number of rows in LHS matrix not reshaped.
- * @param[in] N                        Number of columns in RHS matrix not reshaped.
- * @param[in] K                        Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                          __read_only image2d_t rhs_img,
-#if defined(BETA)
-                                                                          IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                          IMAGE_DECLARATION(dst),
-                                                                          // Post Op arguments
-                                                                          IMAGE_DECLARATION(eltwise_operand),
-                                                                          uint lhs_stride_z,
-                                                                          uint rhs_stride_z,
-#if defined(BETA)
-                                                                          uint bias_stride_z,
-#endif //defined(BETA)
-                                                                          uint dst_stride_z,
-                                                                          uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
-                                                                          ,
-                                                                          uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                          ,
-                                                                          uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                          ,
-                                                                          const int M,
-                                                                          const int N,
-                                                                          const int K)
-{
-    // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-
-    const uint LEFTOVER_K = K % K0;
-
-    // Block size
-#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X (PIXEL_UNIT * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X PIXEL_UNIT
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-    uint z = get_global_id(2);
-
-    const bool cond_y = y == 0;
-    const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((x * N0 >= N) || (y * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
-    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
-#else  // defined(MATRIX_B_DEPTH)
-    const uint z_rhs = get_global_id(2);
-#endif // defined(MATRIX_B_DEPTH)
-
-    // Compute RHS matrix coordinates
-    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
-    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-    // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply lhs_stride_z by DEPTH_GEMM3D
-    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
-    int i = 0;
-    for(; i <= (K - K0); i += K0)
-    {
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
-        // Load values from RHS matrix stored in a cl_image
-        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
-        // Accumulate
-        ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
-        lhs_offset += K0 * sizeof(DATA_TYPE);
-        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
-    }
-
-    if(LEFTOVER_K != 0)
-    {
-        // Note: We cannot read out-of-bound elements from the RHS matrix because
-        // the RHS width is always a multiple of K0. This may not be true for the LHS matrix
-
-        union UNION_VEC_TYPE
-        {
-            DATA_TYPE s[K0];
-            VEC_DATA_TYPE(DATA_TYPE, K0)
-            v;
-        };
-
-        union UNION_VEC_TYPE a0 = {.v = 0 };
-#if M0 > 1
-        union UNION_VEC_TYPE a1 = {.v = 0 };
-#endif // M0 > 1
-#if M0 > 2
-        union UNION_VEC_TYPE a2 = {.v = 0 };
-#endif // M0 > 2
-#if M0 > 3
-        union UNION_VEC_TYPE a3 = {.v = 0 };
-#endif // M0 > 3
-#if M0 > 4
-        union UNION_VEC_TYPE a4 = {.v = 0 };
-#endif // M0 > 4
-#if M0 > 5
-        union UNION_VEC_TYPE a5 = {.v = 0 };
-#endif // M0 > 5
-#if M0 > 6
-        union UNION_VEC_TYPE a6 = {.v = 0 };
-#endif // M0 > 6
-#if M0 > 7
-        union UNION_VEC_TYPE a7 = {.v = 0 };
-#endif // M0 > 7
-
-        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-
-        // Load from RHS matrix
-        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
-        // Load from LHS matrix
-        for(int k = 0; k < LEFTOVER_K; ++k)
-        {
-            a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
-#if M0 > 1
-            a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
-#endif // M0 > 1
-#if M0 > 2
-            a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
-#endif // M0 > 2
-#if M0 > 3
-            a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
-#endif // M0 > 3
-#if M0 > 4
-            a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
-#endif // M0 > 4
-#if M0 > 5
-            a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
-#endif // M0 > 5
-#if M0 > 6
-            a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
-#endif // M0 > 6
-#if M0 > 7
-            a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
-#endif // M0 > 7
-
-            lhs_offset += sizeof(DATA_TYPE);
-        }
-
-        // Accumulate
-        ARM_DOT_K0XN0(K0, a0.v, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(K0, a1.v, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(K0, a2.v, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(K0, a3.v, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(K0, a4.v, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(K0, a5.v, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(K0, a6.v, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(K0, a7.v, b, c7);
-#endif // M0 > 7
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef PIXEL_UNIT
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-
-#define VFMA(a, b, c)     \
-    ({                    \
-        c = fma(a, b, c); \
-    })
-
-#if M0 == 1
-#define VFMA_M0xN0(i, a, b, c)                                        \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-    })
-#elif M0 == 2 // M0 == 2
-#define VFMA_M0xN0(i, a, b, c)                                        \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-    })
-#elif M0 == 3 // M0 == 3
-#define VFMA_M0xN0(i, a, b, c)                                        \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-    })
-#elif M0 == 4 // M0 == 4
-#define VFMA_M0xN0(i, a, b, c)                                        \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-    })
-#elif M0 == 5 // M0 == 5
-#define VFMA_M0xN0(i, a, b, c)                                        \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-    })
-#elif M0 == 6 // M0 == 6
-#define VFMA_M0xN0(i, a, b, c)                                        \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-    })
-#elif M0 == 7 // M0 == 7
-#define VFMA_M0xN0(i, a, b, c)                                        \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
-    })
-#elif M0 == 8 // M0 == 8
-#define VFMA_M0xN0(i, a, b, c)                                        \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
-    })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-
-#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M                        Number of rows in LHS matrix not reshaped.
- * @param[in] N                        Number of columns in RHS matrix not reshaped.
- * @param[in] K                        Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                   IMAGE_DECLARATION(rhs),
-#if defined(BETA)
-                                                                   IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                   IMAGE_DECLARATION(dst),
-                                                                   // Post Op arguments
-                                                                   IMAGE_DECLARATION(eltwise_operand),
-                                                                   uint lhs_stride_z,
-                                                                   uint rhs_stride_z,
-#if defined(BETA)
-                                                                   uint bias_stride_z,
-#endif //defined(BETA)
-                                                                   uint dst_stride_z,
-                                                                   uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
-                                                                   ,
-                                                                   uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                   ,
-                                                                   uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                   ,
-                                                                   const int M,
-                                                                   const int N,
-                                                                   const int K)
-{
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (N0))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (N0)
-#define RHS_STEP_X ((N0) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (N0)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-    uint z = get_global_id(2);
-
-    const bool cond_y = y == 0;
-    const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((x * N0 >= N) || (y * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-    // Compute RHS reshaped matrix address
-    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
-#else  // defined(MATRIX_B_DEPTH)
-    rhs_offset += z * rhs_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   //uint zin0=0,zin1=0,zin2=0,... zin7=0;
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero15=0;
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-
-    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply lhs_stride_z by DEPTH_GEMM3D
-    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
-
-    int i = 0;
-    for(; i <= (K - K0); i += K0)
-    {
-        // Supported cases (M0, K0):
-        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
-        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
-        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
-        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
-        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
-        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
-        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
-        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b0;
-
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(0, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(4, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(5, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(6, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(8, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(9, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(A, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(B, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(C, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(D, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(E, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
-
-        lhs_offset += K0 * sizeof(DATA_TYPE);
-        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
-    }
-
-    // Left-over accumulations
-    for(; i < K; ++i)
-    {
-        // Load values from LHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
-
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b0;
-
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(0, a, b0, c);
-
-        lhs_offset += sizeof(DATA_TYPE);
-        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef RHS_STEP_LOOP
-}
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-
-#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
- * Post op 1: activation (optional)
- * Post op 2: elementwise op
- * Post op 3: activation (optional)
- *
- * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
- * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
- * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
- * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
- *
- * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt_texture, with these additions:
- *
- * @param[in] eltwise_operand_ptr      Pointer to the eltwise operand matrix. Supported data type: F16/F32
- * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
- * @param[in] eltwise_operand_step_x   eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
- * @param[in] eltwise_operand_step_y   eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
- * @param[in] M                        Number of rows in LHS matrix not reshaped.
- * @param[in] N                        Number of columns in RHS matrix not reshaped.
- * @param[in] K                        Number of columns in LHS matrix and rows in RHS matrix not reshaped.
- */
-__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
-                                                                           __read_only image2d_t rhs_img,
-#if defined(BETA)
-                                                                           IMAGE_DECLARATION(bias),
-#endif // defined(BETA)
-                                                                           IMAGE_DECLARATION(dst),
-                                                                           // Post Op arguments
-                                                                           IMAGE_DECLARATION(eltwise_operand),
-                                                                           uint lhs_stride_z,
-                                                                           uint rhs_stride_z,
-#if defined(BETA)
-                                                                           uint bias_stride_z,
-#endif //defined(BETA)
-                                                                           uint dst_stride_z,
-                                                                           uint eltwise_operand_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
-                                                                           ,
-                                                                           uint lhs_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-                                                                           ,
-                                                                           uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
-                                                                           ,
-                                                                           const int M,
-                                                                           const int N,
-                                                                           const int K)
-{
-    // Pixel unit
-#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
-
-    // Block size
-#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
-
-    // RHS offset and step X
-#if defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (PIXEL_UNIT)
-#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
-#define RHS_STEP_LOOP (1)
-#else // defined(RHS_INTERLEAVE)
-#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
-#define RHS_STEP_X (PIXEL_UNIT)
-#define RHS_STEP_LOOP (H0)
-#endif // defined(RHS_INTERLEAVE)
-
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-    uint z = get_global_id(2);
-
-    const bool cond_y = y == 0;
-    const bool cond_x = ((x + 1) * N0 >= N);
-
-#if defined(DUMMY_WORK_ITEMS)
-    if((x * N0 >= N) || (y * M0 >= M))
-    {
-        return;
-    }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-    // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
-
-#if defined(MATRIX_B_DEPTH)
-    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-    const uint z_rhs = (z % MATRIX_B_DEPTH);
-#else  // defined(MATRIX_B_DEPTH)
-    const uint z_rhs = z;
-#endif // defined(MATRIX_B_DEPTH)
-
-    // Compute RHS matrix coordinates
-    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
-    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
-
-    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
-    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-
-    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply lhs_stride_z by DEPTH_GEMM3D
-    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
-    int i = 0;
-    for(; i <= (K - K0); i += K0)
-    {
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b0;
-
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(0, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(4, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(5, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(6, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(8, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(9, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(A, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(B, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(C, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(D, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(E, a, b0, c);
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
-        VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
-
-        lhs_offset += K0 * sizeof(DATA_TYPE);
-        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
-    }
-
-    // Left-over accumulations
-    for(; i < K; ++i)
-    {
-        // Load values from LHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
-
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b0;
-        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
-
-        VFMA_M0xN0(0, a, b0, c);
-
-        lhs_offset += sizeof(DATA_TYPE);
-        x_rhs += RHS_STEP_X;
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    // c = act(c)
-    POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-    // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
-    POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x);
-    // c = act(c)
-    POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
-
-    // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-#undef RHS_STEP_LOOP
-}
-#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
-#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
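For reference, the removed *_post_act_eltwise_op_act kernels in this file all apply the same fused epilogue after the GEMM accumulation. A minimal scalar sketch of that epilogue, assuming ADD as the P2 eltwise op and ReLU standing in for the optional P1/P3 activations (fused_epilogue and act are hypothetical names, not library symbols):

#include <math.h>
#include <stdio.h>

/* Stand-in for the optional P1/P3 activation post ops (ReLU assumed). */
static float act(float x) { return fmaxf(x, 0.0f); }

/* One output element of the removed fused kernels (ADD assumed for the P2 eltwise op). */
static float fused_epilogue(float acc, float alpha, float beta, float bias, float eltwise)
{
    float c = alpha * acc; /* SCALE_BLOCK(..., ALPHA)        */
    c += beta * bias;      /* beta * bias                    */
    c = act(c);            /* POST_OP1_ACTIVATION_OPTIONAL   */
    c += eltwise;          /* POST_OP2_ELTWISE_OP (ADD case) */
    return act(c);         /* POST_OP3_ACTIVATION_OPTIONAL   */
}

int main(void)
{
    printf("%g\n", fused_epilogue(2.0f, 1.0f, 1.0f, 0.5f, -1.0f)); /* prints 1.5 */
    return 0;
}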
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
deleted file mode 100644
index b584251..0000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** (EXPERIMENTAL_POST_OPS) Macros for (binary) elementwise operations */
-
-/** List of (binary) elementwise operators, accounting for the argument position of argument X
- * @note X_POS denotes the position of argument X, e.g. X_POS_0 means X is in the first place whereas X_POS_1 means X is in the second place
- * @name elementwise_post_ops
- * @{
- */
-#if defined(N0) && !defined(VEC_SIZE)
-#define VEC_SIZE N0
-#endif // defined(N0) && !defined(VEC_SIZE)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-
-#define ADD_X_POS_0(x, y) (x) + (y)
-#define SUB_X_POS_0(x, y) (x) - (y)
-#define MAX_X_POS_0(x, y) max(x, y)
-#define MIN_X_POS_0(x, y) min(x, y)
-#define SQUARED_DIFF_X_POS_0(x, y) (x - y) * (x - y)
-#define POWER_X_POS_0(x, y) pow(x, y)
-#if VEC_SIZE == 1
-#define PRELU_X_POS_0(x, y) (x > 0 ? x : x * y)
-#else // VEC_SIZE == 1
-
-#if defined(MIXED_PRECISION)
-#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
-#else // MIXED_PRECISION
-#define PRELU_X_POS_0(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
-#endif // MIXED_PRECISION
-
-#endif // VEC_SIZE == 1
-#define DIV_X_POS_0(x, y) (x / y)
-#define AND_X_POS_0(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
-#define OR_X_POS_0(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))1))
-
-#define ADD_X_POS_1(x, y) ADD_X_POS_0(x, y)
-#define SUB_X_POS_1(x, y) (y) - (x)
-#define MAX_X_POS_1(x, y) MAX_X_POS_0(x, y)
-#define MIN_X_POS_1(x, y) MIN_X_POS_0(x, y)
-#define SQUARED_DIFF_X_POS_1(x, y) SQUARED_DIFF_X_POS_0(x, y)
-#define POWER_X_POS_1(x, y) pow(y, x)
-#if VEC_SIZE == 1
-#define PRELU_X_POS_1(x, y) (y > 0 ? y : y * x)
-#else // VEC_SIZE == 1
-
-#if defined(MIXED_PRECISION)
-#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE_ACCUMULATOR)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, VEC_SIZE))))
-#else // MIXED_PRECISION
-#define PRELU_X_POS_1(x, y) (select(x * y, y, CONVERT((y > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))))
-#endif // MIXED_PRECISION
-
-#endif // VEC_SIZE == 1
-#define DIV_X_POS_1(x, y) (y / x)
-#define AND_X_POS_1(x, y) AND_X_POS_0(x, y)
-#define OR_X_POS_1(x, y) OR_X_POS_0(x, y)
-
-// By default use the order of the arguments as they are passed in, i.e. _X_POS_0
-#define ADD(x, y) ADD_X_POS_0(x, y)
-#define SUB(x, y) SUB_X_POS_0(x, y)
-#define MAX(x, y) MAX_X_POS_0(x, y)
-#define MIN(x, y) MIN_X_POS_0(x, y)
-#define SQUARED_DIFF(x, y) SQUARED_DIFF_X_POS_0(x, y)
-#define POWER(x, y) POWER_X_POS_0(x, y)
-#define PRELU(x, y) PRELU_X_POS_0(x, y)
-#define DIV(x, y) DIV_X_POS_0(x, y)
-#define AND(x, y) AND_X_POS_0(x, y)
-#define OR(x, y) OR_X_POS_0(x, y)
-
-#endif    // defined(VEC_SIZE) && defined(DATA_TYPE)
-/** @} */ // end of group elementwise_post_ops
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name ELTWISE_OP_ROW_n
- *
- * @param[in]      OP       The elementwise post op
- * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in]      OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2) \
-    OPERAND1##0 = OP(OPERAND1##0, OPERAND2##0);
-
-#define ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_1(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##1 = OP(OPERAND1##1, OPERAND2##1);
-
-#define ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_2(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##2 = OP(OPERAND1##2, OPERAND2##2);
-
-#define ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_3(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##3 = OP(OPERAND1##3, OPERAND2##3);
-
-#define ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_4(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##4 = OP(OPERAND1##4, OPERAND2##4);
-
-#define ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_5(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##5 = OP(OPERAND1##5, OPERAND2##5);
-
-#define ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_6(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##6 = OP(OPERAND1##6, OPERAND2##6);
-
-#define ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_7(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##7 = OP(OPERAND1##7, OPERAND2##7);
-
-#define ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_8(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##8 = OP(OPERAND1##8, OPERAND2##8);
-
-#define ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_9(OP, OPERAND1, OPERAND2)      \
-    OPERAND1##9 = OP(OPERAND1##9, OPERAND2##9);
-
-#define ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_10(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##A = OP(OPERAND1##A, OPERAND2##A);
-
-#define ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_11(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##B = OP(OPERAND1##B, OPERAND2##B);
-
-#define ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_12(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##C = OP(OPERAND1##C, OPERAND2##C);
-
-#define ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_13(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##D = OP(OPERAND1##D, OPERAND2##D);
-
-#define ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_14(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##E = OP(OPERAND1##E, OPERAND2##E);
-
-#define ELTWISE_OP_ROW_16(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_15(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##F = OP(OPERAND1##F, OPERAND2##F);
-
-/** @} */ // end of group ELTWISE_OP_ROW_n
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name ELTWISE_OP_BLOCK
- *
- * Supported cases are N=1,2,3,...,16
- *
- * @param[in] OP       The elementwise post op
- * @param[in] N        The number of vectors in the block
- * @param[in] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_##N(OP, OPERAND1, OPERAND2)
-#define ELTWISE_OP_BLOCK(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_STR(OP, N, OPERAND1, OPERAND2)
-/** @} */ // end of group ELTWISE_OP_BLOCK
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
- * @name ELTWISE_OP_ROW_BROADCAST_n
- *
- * @param[in]      OP       The elementwise post op
- * @param[in, out] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in]      OPERAND2 The basename of the broadcast operand 2 variables
- * @{
- */
-#define ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2) \
-    OPERAND1##0 = OP(OPERAND1##0, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_1(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##1 = OP(OPERAND1##1, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_2(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##2 = OP(OPERAND1##2, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_3(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##3 = OP(OPERAND1##3, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_4(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##4 = OP(OPERAND1##4, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_5(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##5 = OP(OPERAND1##5, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_6(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##6 = OP(OPERAND1##6, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_7(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##7 = OP(OPERAND1##7, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_8(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##8 = OP(OPERAND1##8, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_9(OP, OPERAND1, OPERAND2)      \
-    OPERAND1##9 = OP(OPERAND1##9, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_10(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##A = OP(OPERAND1##A, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_11(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##B = OP(OPERAND1##B, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_12(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##C = OP(OPERAND1##C, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_13(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##D = OP(OPERAND1##D, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_14(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##E = OP(OPERAND1##E, OPERAND2);
-
-#define ELTWISE_OP_ROW_BROADCAST_16(OP, OPERAND1, OPERAND2) \
-    ELTWISE_OP_ROW_BROADCAST_15(OP, OPERAND1, OPERAND2)     \
-    OPERAND1##F = OP(OPERAND1##F, OPERAND2);
-
-/** @} */ // end of group ELTWISE_OP_ROW_BROADCAST_n
-
-/** Performs OPERAND1 = OP(OPERAND1, OPERAND2) with broadcasting
- * @name ELTWISE_OP_BLOCK_BROADCAST
- * @note Only support:
- *      case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX];
- *      case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar);
- *      Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1];
- *
- * Supported cases are N=1,2,3,...,16
- *
- * @param[in] OP       The elementwise post op
- * @param[in] N        The number of vectors in the block
- * @param[in] OPERAND1 The basename of the destination and operand 1 variables
- * @param[in] OPERAND2 The basename of the operand 2 variables
- * @{
- */
-#define ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_ROW_BROADCAST_##N(OP, OPERAND1, OPERAND2)
-#define ELTWISE_OP_BLOCK_BROADCAST(OP, N, OPERAND1, OPERAND2) ELTWISE_OP_BLOCK_BROADCAST_STR(OP, N, OPERAND1, OPERAND2)
-/** @} */ // end of group ELTWISE_OP_BLOCK_BROADCAST
\ No newline at end of file
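The ELTWISE_OP_ROW_n / ELTWISE_OP_BLOCK helpers removed above rely on a token-pasting pattern in which each ROW_n macro expands the previous one and appends one more per-row statement. A stripped-down, standalone sketch of the same pattern (ROW_n, BLOCK and OP_ADD are illustrative names, not the library's):

#include <stdio.h>

#define OP_ADD(x, y) ((x) + (y))

/* Each ROW_n expands ROW_(n-1) and appends "D##(n-1) = OP(D##(n-1), S##(n-1));". */
#define ROW_1(OP, D, S) D##0 = OP(D##0, S##0);
#define ROW_2(OP, D, S) ROW_1(OP, D, S) D##1 = OP(D##1, S##1);
#define ROW_3(OP, D, S) ROW_2(OP, D, S) D##2 = OP(D##2, S##2);

/* The extra _STR level mirrors ELTWISE_OP_BLOCK_STR: it lets N itself be a macro (e.g. M0). */
#define BLOCK_STR(OP, N, D, S) ROW_##N(OP, D, S)
#define BLOCK(OP, N, D, S) BLOCK_STR(OP, N, D, S)

int main(void)
{
    float c0 = 1.f, c1 = 2.f, c2 = 3.f;
    float e0 = 10.f, e1 = 20.f, e2 = 30.f;
    BLOCK(OP_ADD, 3, c, e) /* expands to: c0 = c0 + e0; c1 = c1 + e1; c2 = c2 + e2; */
    printf("%g %g %g\n", c0, c1, c2); /* prints 11 22 33 */
    return 0;
}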
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
deleted file mode 100644
index e107f44..0000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h"
-#include "gemm_helpers.h"
-#include "load_store_utility.h"
-
-/** (EXPERIMENTAL_POST_OPS) Convenience macros for automatically handling mixed precision (fp16 and fp32) operations
- * -DMIXED_PRECISION toggles mixed precision mode
- */
-
-/** Mixed-Precision-Aware Activation Block
- * @name MIXED_PRECISION_ACTIVATION_BLOCK
- * params N ... B_VAL: same as those in @ref ACTIVATION_BLOCK
- *
- * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
-    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, BASENAME, A_VAL, B_VAL);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL, DATA_TYPE_ACCUMULATOR) \
-    ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL);
-#endif    // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ACTIVATION_BLOCK
-
-/** Mixed-Precision-Aware Elementwise Op Block
- * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name MIXED_PRECISION_ELTWISE_OP_BLOCK
- *
- * @param[in] OP                   The elementwise post op
- * @param[in] M0                   The number of consecutive rows
- * @param[in] N0                   The number of consecutive columns
- * @param[in] OPERAND1             The basename of the first and result operand variables
- * @param[in] OPERAND2             The basename of the second operand variables
- * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] CONVERTED_OPERAND2   The basename of the second operand variables converted to higher-precision in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
-    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2);                                     \
-    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, CONVERTED_OPERAND2);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
-    ELTWISE_OP_BLOCK(OP, M0, OPERAND1, OPERAND2);
-#endif    // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK
-
-/** Mixed-Precision-Aware Elementwise Op Broadcast Block
- * Performs OPERAND1 = OP(OPERAND1, OPERAND2)
- * @name MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
- * @note Only support:
- *      case 1 broadcast in Y dimension : Operand1 [YxX] + Operand2 [1xX]; this means @p N0 > 1
- *      case 2 broadcast in both Y and X dimensions : Operand1 [YxX] + Operand2 [1x1] (scalar) ; this means @p N0 == 1
- *      Does NOT support broadcast in X dimension: Operand1 [YxX] + Operand2 [Yx1]; this means @p M0 should never == 1
- *
- * @param[in] OP                   The elementwise post op
- * @param[in] M0                   The number of consecutive rows, > 1
- * @param[in] N0                   The number of consecutive columns, >= 1
- * @param[in] OPERAND1             The basename of the first and result operand variables
- * @param[in] OPERAND2             The basename of the second operand variables
- * @param[in] DATA_TYPE_ACCUMULATOR Higher-precision accumulator data type in case of mixed-precision op
- * @param[in] CONVERTED_OPERAND2   The basename of the second operand variables converted to higher-precision in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
-    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, OPERAND2, CONVERTED_OPERAND2);                                                \
-    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, CONVERTED_OPERAND2##0);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST(OP, M0, N0, OPERAND1, OPERAND2, DATA_TYPE_ACCUMULATOR, CONVERTED_OPERAND2) \
-    ELTWISE_OP_BLOCK_BROADCAST(OP, M0, OPERAND1, OPERAND2##0);
-#endif    // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_ELTWISE_OP_BLOCK_BROADCAST
-
-/** Mixed-Precision-Aware Boundary-Aware Store Block
- * @name MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
- * params M0 ... PARTIAL_COND_X, same as those in STORE_BLOCK_BOUNDARY_AWARE
- *
- * @param[in] BASENAME_LP The name of the low precision variables, converted from BASENAME, in case of mixed-precision op
- * @{
- */
-#if defined(MIXED_PRECISION)
-#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
-    CONVERT_BLOCK(M0, N0, DATA_TYPE, BASENAME, BASENAME_LP);                                                                                                                       \
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME_LP, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
-#else // defined(MIXED_PRECISION)
-#define MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X, BASENAME_LP) \
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X);
-#endif    // defined(MIXED_PRECISION)
-/** @} */ // end of group MIXED_PRECISION_STORE_BLOCK_BOUNDARY_AWARE
\ No newline at end of file
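The MIXED_PRECISION helpers removed above only select which type the block operations run in: with -DMIXED_PRECISION the work happens in the wider accumulator type and is narrowed to DATA_TYPE at the store, otherwise DATA_TYPE is used throughout. A tiny sketch of that selection, with acc_t and data_t as illustrative stand-ins for DATA_TYPE_ACCUMULATOR and DATA_TYPE:

#include <stdio.h>

#define MIXED_PRECISION /* comment out to take the single-type path */

typedef float data_t;  /* stand-in for DATA_TYPE             */
#if defined(MIXED_PRECISION)
typedef double acc_t;  /* stand-in for DATA_TYPE_ACCUMULATOR */
#else
typedef data_t acc_t;  /* accumulate directly in DATA_TYPE   */
#endif

int main(void)
{
    acc_t  acc = (acc_t)0.1f + (acc_t)0.2f; /* block ops run in acc_t             */
    data_t out = (data_t)acc;               /* CONVERT_BLOCK happens before store */
    printf("%f\n", (double)out);
    return 0;
}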
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index a32301d..0c30c0e 100644
--- a/src/core/CL/cl_kernels/common/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -152,7 +152,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  *  The LHS matrix is NOT reshaped
  *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
  *
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
  * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
@@ -453,7 +452,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
  *  The LHS matrix is NOT reshaped
  *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
  *
  * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -887,7 +885,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  *  The LHS matrix is NOT reshaped
  *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
  *
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
  * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
@@ -1213,7 +1210,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  *  The LHS matrix is NOT reshaped
  *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
  *
  * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -1713,7 +1709,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
  * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
@@ -1993,7 +1988,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
  * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
@@ -2380,7 +2374,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -2767,7 +2760,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
  *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
  *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
  *
  * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
  * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
@@ -3226,7 +3218,6 @@
 /** This OpenCL kernel computes the matrix multiplication between 2 matrices.
  *  The LHS matrix is NOT reshaped
  *  The RHS matrix is NOT reshaped
- * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
  *
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
  * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
diff --git a/src/core/experimental/PostOpUtils.h b/src/core/experimental/PostOpUtils.h
deleted file mode 100644
index 6217dcc..0000000
--- a/src/core/experimental/PostOpUtils.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2021, 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
-#define ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
-
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/core/experimental/PostOps.h"
-
-#include "arm_compute/core/experimental/Types.h"
-#include "support/Cast.h"
-
-#include <vector>
-
-/** (EXPERIMENTAL_POST_OPS) */
-namespace arm_compute
-{
-namespace experimental
-{
-/** Transform a PostOpList of type FromTensorT to one of type ToTensorT */
-template <typename FromTensorT, typename ToTensorT>
-PostOpList<ToTensorT> transform_post_op_list_arguments(const PostOpList<FromTensorT> &post_ops, std::function<ToTensorT(FromTensorT)> transform_arg)
-{
-    PostOpList<ToTensorT> transformed_post_ops;
-    for(const auto &post_op : post_ops.get_list())
-    {
-        switch(post_op->type())
-        {
-            case PostOpType::Activation:
-            {
-                const auto _post_op = utils::cast::polymorphic_downcast<const PostOpAct<FromTensorT> *>(post_op.get());
-                transformed_post_ops.template push_back_op<PostOpAct<ToTensorT>>(_post_op->_act_info);
-                break;
-            }
-            case PostOpType::Eltwise_Add:
-            {
-                const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwiseAdd<FromTensorT> *>(post_op.get());
-                transformed_post_ops.template push_back_op<PostOpEltwiseAdd<ToTensorT>>(transform_arg(_post_op->_addend), _post_op->_prev_dst_pos, _post_op->_policy);
-                break;
-            }
-            case PostOpType::Eltwise_PRelu:
-            {
-                const auto _post_op = utils::cast::polymorphic_downcast<const PostOpEltwisePRelu<FromTensorT> *>(post_op.get());
-                transformed_post_ops.template push_back_op<PostOpEltwisePRelu<ToTensorT>>(transform_arg(_post_op->_alpha_param), _post_op->_prev_dst_pos, _post_op->_policy);
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Unsupported PostOpType");
-            }
-        }
-    }
-    return transformed_post_ops;
-}
-
-/** Get post op argument TensorType from post op argument index in a flattened, ordered post op argument list */
-inline TensorType get_post_op_arg_type(size_t index)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(static_cast<int>(index) > EXPERIMENTAL_ACL_POST_OP_ARG_LAST - EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, "Post Op argument index is out of range");
-    return static_cast<TensorType>(EXPERIMENTAL_ACL_POST_OP_ARG_FIRST + static_cast<int>(index));
-}
-
-/** Get a sequence of PostOp Types from PostOpList */
-template <typename T>
-PostOpTypeSequence get_post_op_sequence(const PostOpList<T> &post_ops)
-{
-    PostOpTypeSequence post_op_sequence;
-    for(const auto &op : post_ops.get_list())
-    {
-        post_op_sequence.push_back(op->type());
-    }
-    return post_op_sequence;
-}
-
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_POSTOPUTILS
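
For reference, the two helpers deleted above did little more than (a) map a flattened post-op argument index onto a reserved tensor-argument slot and (b) rebuild a post-op list with a different tensor handle type by applying a caller-supplied transform to each op's arguments. A minimal, self-contained sketch of that pattern follows; all names, enum values and types are illustrative only and are not the ComputeLibrary API.

// Hypothetical sketch of the removed PostOpUtils helpers:
//  - post_op_arg_slot():  flattened index -> reserved argument slot
//  - transform_ops():     rewrite tensor arguments of a post-op list via a transform
#include <cassert>
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Reserved, contiguous argument slots for post-op tensors (illustrative values).
constexpr int kPostOpArgFirst = 100;
constexpr int kPostOpArgLast  = 103;

int post_op_arg_slot(std::size_t index)
{
    // Mirrors the removed get_post_op_arg_type(): slot = first reserved slot + flattened index.
    assert(static_cast<int>(index) <= kPostOpArgLast - kPostOpArgFirst && "post-op argument index out of range");
    return kPostOpArgFirst + static_cast<int>(index);
}

// A post-op carrying one extra tensor argument (e.g. the addend of an eltwise add),
// parameterised on the handle type used to refer to tensors.
template <typename TensorT>
struct EltwiseAddOp
{
    TensorT addend;
};

// Rebuild the list with a different tensor handle type, applying `transform` to every
// tensor argument, as the removed transform_post_op_list_arguments() did.
template <typename FromT, typename ToT>
std::vector<EltwiseAddOp<ToT>> transform_ops(const std::vector<EltwiseAddOp<FromT>> &ops,
                                             std::function<ToT(FromT)> transform)
{
    std::vector<EltwiseAddOp<ToT>> out;
    for(const auto &op : ops)
    {
        out.push_back({ transform(op.addend) });
    }
    return out;
}

int main()
{
    // Ops described by tensor names are rewritten to ops described by slot ids.
    const std::vector<EltwiseAddOp<std::string>> named{ { "addend0" }, { "addend1" } };
    std::size_t next = 0;
    const auto  by_slot = transform_ops<std::string, int>(named, [&](const std::string &) { return post_op_arg_slot(next++); });

    for(const auto &op : by_slot)
    {
        std::cout << "eltwise-add addend bound to slot " << op.addend << "\n";
    }
    return 0;
}
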
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index d11e4f0..39b410d 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -107,7 +107,7 @@
     // Create GEMMInfo structure
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                         false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format);
+                                         false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
 
     // Supported activations in GEMM
     const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
@@ -156,8 +156,8 @@
         quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
 
         _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
-        _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info,
-                                                                              experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format));
+        _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format,
+                                                                              weight_format));
 
         auto mm_mem_req = _mm_gemmlowp->workspace();
         for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
@@ -188,7 +188,7 @@
     // Create GEMMInfo structure
     const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                         gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                        false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weight_format);
+                                        false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
 
     if(is_quantized)
     {
@@ -422,7 +422,7 @@
     const bool         fixed_format  = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
     const GEMMInfo     gemm_info     = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                                 gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                                false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, experimental::PostOpList<ITensorInfo *>(), fixed_format, weights_info.weight_format());
+                                                false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
 
     return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
 }
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index de2e9f9..e4a3d30 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -275,23 +275,14 @@
     { "gemm_mm_native", "common/gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" },
     { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" },
-    { "gemm_mm_native_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl" },
     { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
     { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
     { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
     { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
-    { "gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
-    { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
-    { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
-    { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" },
     { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
-    { "gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
-    { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
-    { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
     { "gemm_lc_vm_f32", "common/gemm.cl" },
     { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" },
     { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" },
@@ -623,26 +614,6 @@
 #include "./cl_kernels/common/gemm_utils.clembed"
     },
     {
-        "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.hembed"
-    },
-    {
-        "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.hembed"
-    },
-    {
-        "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed"
-    },
-    {
-        "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed"
-    },
-    {
-        "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
-#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.clembed"
-    },
-    {
         "common/gemmlowp.cl",
 #include "./cl_kernels/common/gemmlowp.clembed"
     },
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index 5fea097..b8997df 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -31,11 +30,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLUtils.h"
-#include "src/core/experimental/PostOpUtils.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/float_ops.h"
@@ -52,25 +51,6 @@
 {
 using ElementsProcessed = Steps;
 
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
-    //  PostOp sequence                   -> {Kernel Postfix, PostOp Slots}
-    { {}, { "", {} } },
-    { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
-    { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
-    { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
-    { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-    { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
 Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
                           const GEMMRHSMatrixInfo &rhs_info,
                           const GEMMKernelInfo    &gemm_info)
@@ -90,7 +70,6 @@
                                     "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
 
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
@@ -133,7 +112,6 @@
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
     }
 
     return Status{};
@@ -240,7 +218,6 @@
     _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
     _add_bias                 = src2 != nullptr;
-    _num_post_op_args         = gemm_info.post_ops.total_num_arguments();
 
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -298,20 +275,11 @@
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
     build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
     build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    // If post_ops are used, then we disable the use of gemm_info.activation_info
-    if(gemm_info.post_ops.size() > 0)
-    {
-        post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
-    }
-    else
-    {
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-    }
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
     std::string kernel_name("gemm_mm_native");
-    post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
 
     // A macro guard to compile ONLY the kernel of interest
     build_opts.add_option("-D" + upper_string(kernel_name));
@@ -396,11 +364,11 @@
         unsigned int idx0;
         if(_add_bias)
         {
-            idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (7 + _num_post_op_args);
+            idx0 = 4 * num_arguments_per_2D_tensor() + 7;
         }
         else
         {
-            idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (6 + _num_post_op_args);
+            idx0 = 3 * num_arguments_per_2D_tensor() + 6;
         }
         const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -412,11 +380,11 @@
         unsigned int idx0;
         if(_add_bias)
         {
-            idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+            idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0);
         }
         else
         {
-            idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+            idx0 = 3 * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0);
         }
         const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -440,12 +408,7 @@
             add_2D_tensor_argument(idx, src2, slice);
         }
         add_2D_tensor_argument(idx, dst, slice);
-        // post op argument buffers
-        for(size_t i = 0; i < _num_post_op_args; ++i)
-        {
-            const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
-            add_2D_tensor_argument(idx, post_op_arg, slice);
-        }
+
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
         if(_add_bias)
@@ -453,12 +416,6 @@
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
         }
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
-        // post op argument stride_z
-        for(size_t i = 0; i < _num_post_op_args; ++i)
-        {
-            const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
-        }
 
         // Pass m, n and k at runtime
         _kernel.setArg<cl_int>(idx++, _m);
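
The argument-index arithmetic simplified above is easier to follow with numbers plugged in. The sketch below assumes num_arguments_per_2D_tensor() returns 6 and uses a post-op count of 2 purely for illustration; both values are assumptions, not taken from this change.

// Hypothetical illustration of the old vs. new idx0 computation in run_op().
#include <iostream>

int main()
{
    constexpr unsigned int args_per_2d_tensor = 6; // assumed per-tensor argument count
    constexpr unsigned int num_post_op_args   = 2; // assumed, e.g. one addend plus one PRelu alpha

    // Legacy computation: every post-op tensor shifted the indices that followed it.
    const unsigned int old_with_bias = (4 + num_post_op_args) * args_per_2d_tensor + (7 + num_post_op_args);
    const unsigned int old_no_bias   = (3 + num_post_op_args) * args_per_2d_tensor + (6 + num_post_op_args);

    // New computation: only src0, src1, (bias), dst plus the scalar arguments remain.
    const unsigned int new_with_bias = 4 * args_per_2d_tensor + 7;
    const unsigned int new_no_bias   = 3 * args_per_2d_tensor + 6;

    std::cout << "with bias:    " << old_with_bias << " -> " << new_with_bias << "\n"; // 45 -> 31
    std::cout << "without bias: " << old_no_bias   << " -> " << new_no_bias   << "\n"; // 38 -> 24
    return 0;
}
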
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index e478df7..80f8355 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
 #include "src/core/common/Macros.h"
@@ -76,17 +76,16 @@
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool         _slide_matrix_b{ true };
-    bool         _reinterpret_input_as_3d{ false };
-    bool         _reinterpret_output_as_3d{ false };
-    bool         _use_dummy_work_items{ false };
-    bool         _add_bias{ false };
-    signed int   _m{ 1 };
-    signed int   _n{ 1 };
-    signed int   _k{ 1 };
-    unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+    bool       _slide_matrix_b{ true };
+    bool       _reinterpret_input_as_3d{ false };
+    bool       _reinterpret_output_as_3d{ false };
+    bool       _use_dummy_work_items{ false };
+    bool       _add_bias{ false };
+    signed int _m{ 1 };
+    signed int _n{ 1 };
+    signed int _k{ 1 };
 };
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H */
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index f14a6f1..d72d29e 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -31,11 +30,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
-#include "src/core/experimental/PostOpUtils.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/float_ops.h"
@@ -53,25 +52,6 @@
 {
 using ElementsProcessed = Steps;
 
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
-    //  PostOp sequence                   -> {Kernel Postfix, PostOp Slots}
-    { {}, { "", {} } },
-    { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
-    { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
-    { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
-    { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-    { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
 Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
                           const GEMMRHSMatrixInfo &rhs_info,
                           const GEMMKernelInfo    &gemm_info)
@@ -95,7 +75,6 @@
                                     "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
     ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
 
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
@@ -139,7 +118,6 @@
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
     }
 
     return Status{};
@@ -202,7 +180,6 @@
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
     _add_bias                 = src2 != nullptr;
     _export_to_cl_image       = rhs_info.export_to_cl_image;
-    _num_post_op_args         = gemm_info.post_ops.total_num_arguments();
 
     // Check if we need to slide the matrix B
     const unsigned int num_dimensions_src0 = src0->num_dimensions();
@@ -260,23 +237,14 @@
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
     build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
     build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    // If post_ops are used, then we disable the use of gemm_info.activation_info
-    if(gemm_info.post_ops.size() > 0)
-    {
-        post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
-    }
-    else
-    {
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-    }
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
     std::string kernel_name("gemm_mm_reshaped_");
     kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
     kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
     kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
-    post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
 
     // A macro guard to compile ONLY the kernel of interest
     build_opts.add_option("-D" + upper_string(kernel_name));
@@ -395,13 +363,6 @@
         // dst buffer
         add_2D_tensor_argument(idx, dst, slice);
 
-        // post op argument buffers
-        for(size_t i = 0; i < _num_post_op_args; ++i)
-        {
-            const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
-            add_2D_tensor_argument(idx, post_op_arg, slice);
-        }
-
         // LHS stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
 
@@ -417,12 +378,6 @@
         // dst stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
 
-        // post op argument stride_z
-        for(size_t i = 0; i < _num_post_op_args; ++i)
-        {
-            const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
-        }
         // Cross-plan padding (if _reinterpret_output_as_3d = true)
         if(_reinterpret_output_as_3d)
         {
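
With the post-op branch gone, activation fusion in these kernels always goes through the same three compile-time defines. The snippet below is a generic sketch of how such -D options compose into a build string; the option names follow the diff above, but the containers and values are illustrative and this is not the CLBuildOptions class itself.

// Hypothetical sketch: composing fused-activation compile defines for a kernel build.
#include <iostream>
#include <string>
#include <vector>

int main()
{
    const bool  act_enabled = true; // stands in for gemm_info.activation_info.enabled()
    const float a_val       = 6.0f; // assumed activation parameters for the example
    const float b_val       = 0.0f;

    std::vector<std::string> build_opts;
    if(act_enabled)
    {
        build_opts.push_back("-DACTIVATION_TYPE=relu"); // lower-cased activation name (assumed)
        build_opts.push_back("-DA_VAL=" + std::to_string(a_val));
        build_opts.push_back("-DB_VAL=" + std::to_string(b_val));
    }

    // The kernel is then built with the options joined into one space-separated string.
    std::string options;
    for(const auto &opt : build_opts)
    {
        options += opt + " ";
    }
    std::cout << options << "\n";
    return 0;
}
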
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index 2d668b9..8d25412 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
 
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
@@ -100,17 +100,16 @@
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool         _slide_matrix_b{ true };
-    bool         _reinterpret_output_as_3d{ false };
-    bool         _use_dummy_work_items{ false };
-    bool         _add_bias{ false };
-    bool         _export_to_cl_image{ false };
-    signed int   _m{ 1 };
-    signed int   _n{ 1 };
-    signed int   _k{ 1 };
-    unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+    bool       _slide_matrix_b{ true };
+    bool       _reinterpret_output_as_3d{ false };
+    bool       _use_dummy_work_items{ false };
+    bool       _add_bias{ false };
+    bool       _export_to_cl_image{ false };
+    signed int _m{ 1 };
+    signed int _n{ 1 };
+    signed int _k{ 1 };
 };
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */
\ No newline at end of file
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index f780538..b34c17c 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -23,13 +23,12 @@
  */
 #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
-#include "src/core/experimental/PostOpUtils.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/float_ops.h"
@@ -47,25 +46,6 @@
 {
 using ElementsProcessed = Steps;
 
-const auto post_op_utils = experimental::PostOpCLKernelUtils(
-{
-    //  PostOp sequence                   -> {Kernel Postfix, PostOp Slots}
-    { {}, { "", {} } },
-    { { experimental::PostOpType::Activation }, { "", { 1 } } },
-
-    { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
-    { { experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 2 } } },
-
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu }, { "_post_act_eltwise_op_act", { 1, 2 } } },
-
-    { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-    { { experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
-
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } },
-    { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_PRelu, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
-});
-
 Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
                           const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
 {
@@ -86,7 +66,6 @@
                                     "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
     ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
 
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
@@ -132,7 +111,6 @@
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
     }
 
     return Status{};
@@ -203,7 +181,6 @@
     _add_bias                 = src2 != nullptr;
     _export_to_cl_image       = rhs_info.export_to_cl_image;
     _has_pad_y                = gemm_info.has_pad_y;
-    _num_post_op_args         = gemm_info.post_ops.total_num_arguments();
 
     auto padding_info = get_padding_info({ src0, src1, src2, dst });
 
@@ -270,22 +247,14 @@
         build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
         build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
     }
-    // If post_ops are used, then we disable the use of gemm_info.activation_info
-    if(gemm_info.post_ops.size() > 0)
-    {
-        post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
-    }
-    else
-    {
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-        build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
-    }
+
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
     std::string kernel_name("gemm_mm_reshaped_only_rhs_");
     kernel_name += rhs_info.transpose ? "t" : "nt";
     kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
-    post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
 
     // A macro guard to compile ONLY the kernel of interest
     build_opts.add_option("-D" + upper_string(kernel_name));
@@ -411,13 +380,6 @@
         // dst buffer
         add_2D_tensor_argument(idx, dst, slice);
 
-        // post op argument buffers
-        for(size_t i = 0; i < _num_post_op_args; ++i)
-        {
-            const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
-            add_2D_tensor_argument(idx, post_op_arg, slice);
-        }
-
         // LHS stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
 
@@ -432,12 +394,6 @@
 
         // dst stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
-        // post op argument stride_z
-        for(size_t i = 0; i < _num_post_op_args; ++i)
-        {
-            const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
-        }
 
         // Cross-plan padding (if _reinterpret_input_as_3d = true)
         if(_reinterpret_input_as_3d && _has_pad_y)
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index 00cdb29..471160c 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
-#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
 
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
@@ -90,19 +90,18 @@
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool         _slide_matrix_b{ true };
-    bool         _reinterpret_input_as_3d{ false };
-    bool         _reinterpret_output_as_3d{ false };
-    bool         _use_dummy_work_items{ false };
-    bool         _add_bias{ false };
-    bool         _export_to_cl_image{ false };
-    bool         _has_pad_y{ false };
-    signed int   _m{ 1 };
-    signed int   _n{ 1 };
-    signed int   _k{ 1 };
-    unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
+    bool       _slide_matrix_b{ true };
+    bool       _reinterpret_input_as_3d{ false };
+    bool       _reinterpret_output_as_3d{ false };
+    bool       _use_dummy_work_items{ false };
+    bool       _add_bias{ false };
+    bool       _export_to_cl_image{ false };
+    bool       _has_pad_y{ false };
+    signed int _m{ 1 };
+    signed int _n{ 1 };
+    signed int _k{ 1 };
 };
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index 51248d4..eb9475c 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,7 +90,6 @@
         case ConvolutionMethod::WINOGRAD:
         {
             ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
-            ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
             auto f = std::make_unique<ClWinogradConv2d>();
             f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math);
             _operator = std::move(f);
@@ -99,7 +98,6 @@
         case ConvolutionMethod::DIRECT:
         {
             ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
-            ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
             auto f = std::make_unique<ClDirectConv2d>();
             f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
             _operator = std::move(f);
@@ -108,7 +106,6 @@
         case ConvolutionMethod::INDIRECT:
         {
             ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
-            ARM_COMPUTE_ERROR_ON(conv2d_info.post_ops.size() > 0);
             auto f = std::make_unique<ClIndirectConv2d>();
             f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
             _operator = std::move(f);
@@ -142,7 +139,6 @@
         {
             //Validate Winograd
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClWinogradConv2d does not support PostOps");
             ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math));
             break;
         }
@@ -150,7 +146,6 @@
         {
             // Validate direct convolution layer
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClDirectConv2d does not support PostOps");
             ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
             break;
         }
@@ -158,7 +153,6 @@
         {
             // Validate indirect convolution layer
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClIndirectConv2d does not support PostOps");
             ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
             break;
         }
@@ -271,17 +265,17 @@
             if(is_data_type_float(src->data_type()))
             {
                 // Get dst shape
-                TensorShape output_shape       = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
-                const bool  is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
-                const bool  is_ifm_ge_8        = src->dimension(idx_c) >= 8;
-                const bool  is_ifm_ge_16       = src->dimension(idx_c) >= 16;
-                const bool  is_ofm_lte_8       = weights->dimension(3U) <= 8;
-                const bool  is_ofm_lt_64       = weights->dimension(3U) < 64;
-                const bool  workload_gte_8192  = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
-                const bool  is_ifm_gt_ofm      = src->dimension(idx_c) > weights->dimension(3U);
-                const bool  is_m_one           = output_shape[1] * output_shape[2] == 1;
-                const bool  is_unit_stride     = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
-                const int32_t kernel_sz        = weights->dimension(idx_w) * weights->dimension(idx_h);
+                TensorShape   output_shape       = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+                const bool    is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+                const bool    is_ifm_ge_8        = src->dimension(idx_c) >= 8;
+                const bool    is_ifm_ge_16       = src->dimension(idx_c) >= 16;
+                const bool    is_ofm_lte_8       = weights->dimension(3U) <= 8;
+                const bool    is_ofm_lt_64       = weights->dimension(3U) < 64;
+                const bool    workload_gte_8192  = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+                const bool    is_ifm_gt_ofm      = src->dimension(idx_c) > weights->dimension(3U);
+                const bool    is_m_one           = output_shape[1] * output_shape[2] == 1;
+                const bool    is_unit_stride     = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
+                const int32_t kernel_sz          = weights->dimension(idx_w) * weights->dimension(idx_h);
 
                 // Run Winograd if valid and IFM >= 8
                 if(is_wino_valid && is_ifm_ge_8)
@@ -330,7 +324,7 @@
                         {
                             const bool is_kernel_sz_odd = kernel_sz % 2;
                             const bool is_g77           = gpu_target == GPUTarget::G77;
-                            preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT;
+                            preferred_conv_method       = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT;
                         }
 
                         // Direct/indirect convolution used for the first layer of the network
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index 8db6dab..7e331a8 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -38,7 +38,6 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
-#include "arm_compute/core/experimental/IPostOp.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/utils/helpers/float_ops.h"
@@ -222,7 +221,6 @@
     kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
-    kernel_info.post_ops                = gemm_info.post_ops();
 
     // Set the target for the kernels
     _mm_native_kernel->set_target(gpu_target);
@@ -254,7 +252,6 @@
     kernel_info.reinterpret_input_as_3d = false;
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
-    kernel_info.post_ops                = gemm_info.post_ops();
 
     // Set the target for the kernels
     _reshape_lhs_kernel->set_target(gpu_target);
@@ -299,7 +296,6 @@
     kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
-    kernel_info.post_ops                = gemm_info.post_ops();
 
     // Set the target for the kernels
     _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
@@ -346,7 +342,6 @@
     kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
-    kernel_info.post_ops                = gemm_info.post_ops();
 
     // Set the target for the kernels
     _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
@@ -396,7 +391,6 @@
     kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
-    kernel_info.post_ops                = gemm_info.post_ops();
 
     auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
 
@@ -433,7 +427,6 @@
     kernel_info.reinterpret_input_as_3d = false;
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
-    kernel_info.post_ops                = gemm_info.post_ops();
 
     GEMMLHSMatrixInfo lhs_info;
     GEMMRHSMatrixInfo rhs_info;
@@ -482,7 +475,6 @@
     kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
-    kernel_info.post_ops                = gemm_info.post_ops();
 
     GEMMLHSMatrixInfo lhs_info;
     GEMMRHSMatrixInfo rhs_info;
@@ -531,7 +523,6 @@
     kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
-    kernel_info.post_ops                = gemm_info.post_ops();
 
     GEMMLHSMatrixInfo lhs_info;
     GEMMRHSMatrixInfo rhs_info;
@@ -624,7 +615,12 @@
     // Select GEMMType
     CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
     {
-        CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
+        CLScheduler::get().target(),
+        a->data_type(),
+        m,
+        n,
+        k,
+        batch_size,
     },
     gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
 
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
index 682477e..5620471 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.cpp
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,14 +54,14 @@
 {
 ClGemmConv2d::ClGemmConv2d()
     : _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(),
-      _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _use_post_ops(false), _aux_mem(AuxTensorIdx::Count)
+      _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
 {
 }
 ClGemmConv2d::~ClGemmConv2d() = default;
 
 void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
-                                int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops)
+                                int gemm_3d_depth, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
     ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
@@ -76,14 +76,12 @@
                                          false,                 // fast_math
                                          false,                 // fp_mixed_precision
                                          true,                  // broadcast_bias
-                                         act_info,              // activation_info
-                                         post_ops               // post ops
+                                         act_info               // activation_info
                                         );
 
     TensorInfo tmp_src{ *src };
     if(_is_quantized)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops");
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate input and weights offset
         const QuantizationInfo input_quantization_info   = src->quantization_info();
@@ -118,7 +116,7 @@
 }
 
 Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops)
+                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
 
@@ -132,13 +130,11 @@
                                          false,                 // fast_math
                                          false,                 // fp_mixed_precision
                                          true,                  // broadcast_bias
-                                         act_info,              // activation_info
-                                         post_ops               // post ops
+                                         act_info               // activation_info
                                         );
 
     if(is_quantized)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "ClGemmConv2d quantized types do not support post ops");
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate input and weights offset
         const QuantizationInfo input_quantization_info   = src->quantization_info();
@@ -189,19 +185,18 @@
 
     // Only for quantize there are few cases where we cannot fuse the activation function in GEMM
     _fuse_activation = true;
-    _use_post_ops    = conv2d_info.post_ops.size() > 0;
 
     const ITensorInfo *gemm_input_to_use  = src;
     ITensorInfo       *gemm_output_to_use = dst;
 
     // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
+    unsigned int stride_x        = 0;
+    unsigned int stride_y        = 0;
     std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
 
     // Get convolved dimensions
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
+    unsigned int conv_w      = 0;
+    unsigned int conv_h      = 0;
     std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
                                                  src->dimension(idx_height),
                                                  kernel_width,
@@ -318,11 +313,10 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info, conv2d_info.post_ops);
+    configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
 
     if(!_skip_col2im)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(conv2d_info.post_ops.size() > 0, "ClGemmConv2d does not support post ops with col2im operation"); // Post ops must be performed after every other op
         // Set the GPU target for col2im
         _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
         _col2im_kernel->set_target(CLScheduler::get().target());
@@ -334,8 +328,7 @@
     ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
                              "Output shape does not match the expected one");
 
-    // Disable running of activation kernel if post ops are used
-    if(!_fuse_activation && !_use_post_ops)
+    if(!_fuse_activation)
     {
         _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
         _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
@@ -383,15 +376,11 @@
     const bool         is_quantized       = is_data_type_quantized_asymmetric(data_type);
     const bool         skip_im2col        = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1
                                              && conv2d_info.conv_info.stride().second == 1);
-    const bool skip_col2im     = data_layout == DataLayout::NHWC;
-    bool       fuse_activation = true;
-    bool       use_post_ops    = conv2d_info.post_ops.size() > 0;
+    const bool         skip_col2im        = data_layout == DataLayout::NHWC;
+    bool               fuse_activation    = true;
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!skip_im2col
-                                    && conv2d_info.post_ops.size() > 0,
-                                    "ClGemmConv2d does not support post ops with col2im or im2col operation"); // Post ops must be performed after every other op
 
     // Validate biases
     if(biases != nullptr)
@@ -520,8 +509,7 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info,
-                                            conv2d_info.post_ops));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
 
     // Validate Col2Im
     if(!skip_col2im)
@@ -530,8 +518,7 @@
     }
 
     // Validate Activation Layer
-    // Disable running (thus validation) of activation kernel if post ops are used
-    if(!fuse_activation && !use_post_ops)
+    if(!fuse_activation)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
     }
@@ -600,8 +587,7 @@
     }
 
     //Run Activation Layer if we cannot fuse in GEMM
-    // Disable running of activation kernel if post ops are used
-    if(!_fuse_activation && !_use_post_ops)
+    if(!_fuse_activation)
     {
         ITensorPack pack =
         {
@@ -620,7 +606,7 @@
         ICLTensor         *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
         CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
         auto               weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
-        ITensorPack        pack =
+        ITensorPack        pack    =
         {
             { TensorType::ACL_SRC, weights },
             { TensorType::ACL_DST, weights_reshaped.get() }
diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h
index afde7c5..8a46ee2 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.h
+++ b/src/gpu/cl/operators/ClGemmConv2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,12 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_GEMM_CONV2D_H
-#define ARM_COMPUTE_CL_GEMM_CONV2D_H
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
@@ -113,8 +112,8 @@
                            const WeightsInfo &weights_info = WeightsInfo());
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
@@ -133,7 +132,7 @@
      */
     void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
                       const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
-                      int gemm_3d_depth, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+                      int gemm_3d_depth, const ActivationLayerInfo &act_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
      *
      * @param[in] src                   Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -150,7 +149,7 @@
      * @return a status
      */
     static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
-                              int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info, const experimental::PostOpList<ITensorInfo *> &post_ops = experimental::PostOpList<ITensorInfo *> {});
+                              int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
 
     enum AuxTensorIdx
     {
@@ -178,10 +177,9 @@
     bool _fuse_activation;
     bool _append_bias;
     bool _is_prepared;
-    bool _use_post_ops;
 
     experimental::MemoryRequirements _aux_mem;
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_CONV2D_H */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp
index 85d24b4..073ffd4 100644
--- a/src/graph/DataLayerVisitor.cpp
+++ b/src/graph/DataLayerVisitor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -131,14 +131,6 @@
     add_convolution_layer_method<FusedConvolutionBatchNormalizationNode>(_layer_data, n);
 }
 
-void DataLayerVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
-    _layer_data.clear();
-    add_generic_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
-    add_convolution_layer_data<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
-    add_convolution_layer_method<FusedConvolutionBatchNormalizationWithPostOpsNode>(_layer_data, n);
-}
-
 void DataLayerVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
 {
     _layer_data.clear();
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index e5b4add..70fe44e 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018,2021 Arm Limited.
+ * Copyright (c) 2018,2021,2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,7 +37,6 @@
 INode::INode()
     : _graph(nullptr), _id(EmptyNodeID), _common_params({ "", Target::UNSPECIFIED}),
       _outputs(), _input_edges(), _output_edges(), _assigned_target(Target::UNSPECIFIED)
-      ,_post_op_info_list(std::list<std::unique_ptr<ConvPostOpInfo>> {})
 {
 }
 // clang-format on
@@ -200,15 +199,5 @@
 {
     return _assigned_target;
 }
-
-const std::list<std::unique_ptr<ConvPostOpInfo>> &INode::post_op_info_list() const
-{
-    return _post_op_info_list;
-}
-
-std::list<std::unique_ptr<ConvPostOpInfo>> &INode::post_op_info_list()
-{
-    return _post_op_info_list;
-}
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp
index f067d61..5369f6f 100644
--- a/src/graph/INodeVisitor.cpp
+++ b/src/graph/INodeVisitor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -85,14 +85,6 @@
 {
     default_visit(n);
 }
-void DefaultNodeVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
-    default_visit(n);
-}
-void DefaultNodeVisitor::visit(FusedConvolutionWithPostOpNode &n)
-{
-    default_visit(n);
-}
 void DefaultNodeVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
 {
     default_visit(n);
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index c67f6a5..8828104 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -274,8 +274,6 @@
             return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
         case NodeType::FusedConvolutionBatchNormalizationLayer:
             return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
-        case NodeType::FusedConvolutionWithPostOp:
-            return detail::create_fused_convolution_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionWithPostOpNode *>(node), ctx);
         case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
             return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::GenerateProposalsLayer:
@@ -318,8 +316,6 @@
             return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
         case NodeType::StridedSliceLayer:
             return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
-        case NodeType::FusedConvolutionBatchNormalizationLayerWithPostOpsLayer:
-            return detail::create_fused_convolution_batch_normalization_with_post_op<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationWithPostOpsNode *>(node), ctx);
         default:
             return nullptr;
     }
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index c50782d..8fd8c14 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,8 +76,6 @@
                    CLDirectConvolutionLayer,
                    CLGEMMConvolutionLayer,
                    CLWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
-        case NodeType::FusedConvolutionWithPostOp:
-            return detail::validate_fused_convolution_with_post_op<CLGEMMConvolutionLayer>(*polymorphic_downcast<FusedConvolutionWithPostOpNode *>(node));
         case NodeType::DepthToSpaceLayer:
             return detail::validate_depth_to_space_layer<CLDepthToSpaceLayer>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 8eb3e4c..38284b9 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -29,8 +29,6 @@
 #include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
-#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
 #include "src/graph/mutators/MutatorUtils.h"
@@ -333,441 +331,6 @@
     }
 }
 
-/** Check valid combinations:
- *
- * | Main operator | Post operators             |
- * |:--------------|:---------------------------|
- * |conv           | add                        |
- * |conv           | act + add                  |
- * |conv           | add + act                  |
- * |conv           | act + add + act            |
- *
-*/
-#define MAX_VALIDE_COMBINATION 4
-#define MAX_POST_OP_NUM 3
-NodeType valide_post_op_type[MAX_VALIDE_COMBINATION][MAX_POST_OP_NUM] = { { EltwiseLayerNode::node_type },
-    { EltwiseLayerNode::node_type, ActivationLayerNode::node_type },
-    { ActivationLayerNode::node_type, EltwiseLayerNode::node_type },
-    { ActivationLayerNode::node_type, EltwiseLayerNode::node_type, ActivationLayerNode::node_type }
-};
-
-bool check_post_op_type(NodeType *post_op_type, int len)
-{
-    if(len > MAX_POST_OP_NUM || len <= 0)
-    {
-        return false;
-    }
-
-    bool found = false;
-    for(int i = 0; i < MAX_VALIDE_COMBINATION; ++i)
-    {
-        for(int j = 0; j < len; ++j)
-        {
-            if(post_op_type[j] != valide_post_op_type[i][j])
-            {
-                found = false;
-                break;
-            }
-            found = true;
-        }
-        if(found)
-            break;
-    }
-
-    return found;
-}
-
-void fuse_convolution_with_post_op(Graph &g, INode *fused_node, std::list<INode *> post_op_node_list, int prev_op_dst_pos)
-{
-    unsigned int op_idx = 0;
-    // Fuse post operators with conv
-    for(const auto &post_op : post_op_node_list)
-    {
-        switch(post_op->type())
-        {
-            case EltwiseLayerNode::node_type:
-            {
-                auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op);
-                ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr);
-
-                fused_node->post_op_info_list().push_back(std::make_unique<ConvPostOpInfoEltwiseAdd>(prev_op_dst_pos, eltwise_node->convert_policy()));
-                ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Elementwise Layer node with ID : " << post_op->id());
-                break;
-            }
-            case ActivationLayerNode::node_type:
-            {
-                auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op);
-                ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr);
-
-                fused_node->post_op_info_list().push_back(std::make_unique<ConvPostOpInfoActivation>(act_node->activation_info()));
-                ARM_COMPUTE_LOG_GRAPH_VERBOSE(" with Activation Layer node with ID : " << post_op->id());
-                break;
-            }
-            default:
-            {
-                break;
-            }
-        }
-
-        if(op_idx == post_op_node_list.size() - 1) // last fusable node
-        {
-            transfer_driving_nodes_and_remove_old_node(g, fused_node, post_op, true);
-        }
-        else
-        {
-            // Remove node
-            g.remove_node(post_op->id());
-        }
-        op_idx++;
-    }
-}
-
-std::list<INode *> get_post_op_list(Graph &g, int &eltwise_operand_id, int &prev_op_dst_pos, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
-    std::list<INode *> post_op_node_list    = {};
-    NodeID             prev_op_dst_id       = conv_node_id;
-    NodeType           post_op_type_list[3] = { NodeType::Dummy, NodeType::Dummy, NodeType::Dummy };
-    int                post_op_idx          = 0;
-
-    // Get list of the connected nodes
-    auto current_node = g.node(conv_node_id);
-
-    while(post_op_node_list.size() < 3)
-    {
-        // This convolution node must have only one output edge, otherwise this function would not have been called
-
-        auto current_output_edge_id = current_node->output_edges().begin();
-        auto current_output_edge    = g.edge(*current_output_edge_id);
-        auto post_op_node           = current_output_edge->consumer();
-
-        bool fusable_post_op = false;
-        if(post_op_node != nullptr && post_op_node->output_edges().size() > 0)
-        {
-            switch(post_op_node->type())
-            {
-                case EltwiseLayerNode::node_type:
-                {
-                    auto *eltwise_node = arm_compute::utils::cast::polymorphic_downcast<EltwiseLayerNode *>(post_op_node);
-                    ARM_COMPUTE_ERROR_ON(eltwise_node->output(0) == nullptr);
-                    if(eltwise_node->output(0)->accessor() == nullptr)
-                    {
-                        post_op_node_list.push_back(post_op_node);
-                        fusable_post_op                  = true;
-                        post_op_type_list[post_op_idx++] = eltwise_node->type();
-
-                        // Extract elementwise inputs
-                        const auto eltwise_input_id_0 = eltwise_node->input_edge(0)->producer_id();
-                        const auto eltwise_input_id_1 = eltwise_node->input_edge(1)->producer_id();
-                        if(eltwise_input_id_0 == prev_op_dst_id)
-                        {
-                            eltwise_operand_id = eltwise_input_id_1;
-                            prev_op_dst_pos    = 0;
-                        }
-                        else if(eltwise_input_id_1 == prev_op_dst_id)
-                        {
-                            eltwise_operand_id = eltwise_input_id_0;
-                            prev_op_dst_pos    = 1;
-                        }
-                    }
-                    else
-                    {
-                        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with elementwise due to the presence of an output accessor\n");
-                    }
-                    break;
-                }
-                case ActivationLayerNode::node_type:
-                {
-                    auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(post_op_node);
-                    ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr);
-                    // Check if activation is supported for fusion
-                    if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
-                    {
-                        break;
-                    }
-                    if(act_node->output(0)->accessor() == nullptr)
-                    {
-                        post_op_node_list.push_back(post_op_node);
-                        fusable_post_op                  = true;
-                        post_op_type_list[post_op_idx++] = act_node->type();
-                        prev_op_dst_id                   = act_node->id();
-                    }
-                    else
-                    {
-                        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
-                    }
-                    break;
-                }
-                default:
-                {
-                    break;
-                }
-            }
-
-            // Check if the node is not a branching node and current node is fusable
-            if(post_op_node->output_edges().size() == 1 && fusable_post_op == true)
-            {
-                current_node = post_op_node;
-            }
-            else
-            {
-                break;
-            }
-        }
-    }
-
-    // Check whether it's valid post op list
-    if(post_op_node_list.size() > 0)
-    {
-        bool fuse_with_post_op = check_post_op_type(post_op_type_list, post_op_node_list.size());
-        if(!fuse_with_post_op)
-        {
-            post_op_node_list.clear();
-        }
-    }
-
-    return post_op_node_list;
-}
-
-/** Fuse below operators:
- *
- * | Main operator | Post operators             |
- * |:--------------|:---------------------------|
- * |conv           | add                        |
- * |conv           | act + add                  |
- * |conv           | add + act                  |
- * |conv           | act + add + act            |
- *
- * Notes: currently, only GEMM supports fusion with post operator
-*/
-void fuse_convolution_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
-    ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
-
-    auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer());
-    ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr);
-
-    const ConvolutionMethod conv_algorithm = conv_node->convolution_method();
-    if(conv_algorithm != ConvolutionMethod::GEMM)
-    {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
-        return;
-    }
-
-    // Prevent fusion if fused node has an output accessor
-    if(conv_node->output(0)->accessor() == nullptr)
-    {
-        // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
-        const Edge *input_edge = conv_node->input_edge(1);
-        if(input_edge != nullptr && input_edge->tensor() != nullptr)
-        {
-            const DataLayout  data_layout  = input_edge->tensor()->desc().layout;
-            const DataType    data_type    = input_edge->tensor()->desc().data_type;
-            const TensorShape tensor_shape = input_edge->tensor()->desc().shape;
-            if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1))
-            {
-                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
-                return;
-            }
-        }
-        else
-        {
-            return;
-        }
-
-        // Get post op list
-        int                eltwise_operand_id = 0;
-        int                prev_op_dst_pos    = 0; // Previous operator dst's position in current operator
-        std::list<INode *> post_op_node_list  = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations);
-
-        if(post_op_node_list.size() == 0)
-        {
-            return;
-        }
-        else // Do convolution fusion with post op if there're one(elementwise), two or more operators
-        {
-            const Target assigned_target = conv_node->assigned_target();
-
-            // Extract conv inputs
-            const auto   conv_input_id   = conv_node->input_edge(0)->producer_id();
-            const auto   conv_weights_id = conv_node->input_edge(1)->producer_id();
-            const auto   conv_info       = conv_node->convolution_info();
-            const auto   conv_method     = conv_node->convolution_method();
-            const auto   num_groups      = conv_node->num_groups();
-            FastMathHint fast_math_hint  = conv_node->fast_math_hint();
-
-            // Create the fused node
-            const NodeID fused_id = g.add_node<FusedConvolutionWithPostOpNode>(conv_info, num_groups, conv_method, fast_math_hint);
-            ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << conv_node->id());
-
-            // Add connections from the conv inputs to the fused node
-            g.add_connection(conv_input_id, 0, fused_id, 0);
-            g.add_connection(conv_weights_id, 0, fused_id, 1);
-            if(conv_node->input_edge(2) != nullptr)
-            {
-                auto conv_bias_id = conv_node->input_edge(2)->producer_id();
-                g.add_connection(conv_bias_id, 0, fused_id, 2);
-            }
-            // Adding the Element wise operand in case the post op is element wise operation
-            auto it = std::find_if(post_op_node_list.begin(),
-                                   post_op_node_list.end(),
-                                   [&](const INode * nd)
-            {
-                return (nd->type() == graph::NodeType::EltwiseLayer);
-            });
-
-            if(it != post_op_node_list.end())
-            {
-                g.add_connection(eltwise_operand_id, 0, fused_id, 3);
-            }
-            g.remove_node(conv_node->id());
-
-            // Update fused node outputs
-            auto fused_node = g.node(fused_id);
-            fused_node->set_assigned_target(assigned_target);
-
-            // Fuse convolution with post op
-            fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos);
-
-            post_op_node_list.clear();
-            ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl);
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
-    }
-}
-
-void fuse_convolution_batch_normalization_with_post_ops(Graph &g, const Edge *output_edge, unsigned int conv_node_id, const std::set<Activation> &supported_fused_activations)
-{
-    ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
-
-    auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(output_edge->producer());
-    ARM_COMPUTE_ERROR_ON(conv_node->output(0) == nullptr);
-    const ConvolutionMethod conv_algorithm = conv_node->convolution_method();
-    if(conv_algorithm != ConvolutionMethod::GEMM)
-    {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
-        return;
-    }
-
-    // Prevent fusion if fused node has an output accessor
-    if(conv_node->output(0)->accessor() == nullptr)
-    {
-        // If data type is FP32/FP16, data layout is NHWC, and filter size is 1x1, fuse convolution with post op, as Conv1x1 always leads to GEMM.
-        const Edge *input_edge = conv_node->input_edge(1);
-        if(input_edge != nullptr && input_edge->tensor() != nullptr)
-        {
-            const DataLayout  data_layout  = input_edge->tensor()->desc().layout;
-            const DataType    data_type    = input_edge->tensor()->desc().data_type;
-            const TensorShape tensor_shape = input_edge->tensor()->desc().shape;
-            if((data_layout != DataLayout::NHWC) || (is_data_type_float(data_type) == false) || (tensor_shape.y() != 1) || (tensor_shape.z() != 1))
-            {
-                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to non GEMM convolution\n");
-                return;
-            }
-        }
-        else
-        {
-            return;
-        }
-
-        // Get post op list
-        int                eltwise_operand_id = 0;
-        int                prev_op_dst_pos    = 0; // Previous operator dst's position in current operator
-        std::list<INode *> post_op_node_list  = get_post_op_list(g, eltwise_operand_id, prev_op_dst_pos, conv_node_id, supported_fused_activations);
-
-        if(post_op_node_list.size() == 0)
-        {
-            return;
-        }
-        else // Do convolution fusion with post op if there're one(elementwise), two or more operators
-        {
-            const Target assigned_target = conv_node->assigned_target();
-
-            // Extract conv inputs
-            const auto   conv_input_id   = conv_node->input_edge(0)->producer_id();
-            const auto   conv_weights_id = conv_node->input_edge(1)->producer_id();
-            const auto   bn_mean_id      = conv_node->input_edge(3)->producer_id();
-            const auto   bn_var_id       = conv_node->input_edge(4)->producer_id();
-            const auto   conv_info       = conv_node->convolution_info();
-            const auto   conv_method     = conv_node->convolution_method();
-            const auto   num_groups      = conv_node->num_groups();
-            FastMathHint fast_math_hint  = conv_node->fast_math_hint();
-
-            // Create the fused node
-
-            const float  epsilon  = conv_node->epsilon();
-            const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationWithPostOpsNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint);
-
-            ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing FusedConvolutionBatchNormalization node with ID : " << conv_node->id());
-
-            // Add connections from the conv inputs to the fused node
-            g.add_connection(conv_input_id, 0, fused_id, 0);
-            g.add_connection(conv_weights_id, 0, fused_id, 1);
-
-            if(conv_node->input_edge(2) != nullptr)
-            {
-                auto conv_bias_id = conv_node->input_edge(2)->producer_id();
-                g.add_connection(conv_bias_id, 0, fused_id, 2);
-            }
-            g.add_connection(bn_mean_id, 0, fused_id, 3);
-            g.add_connection(bn_var_id, 0, fused_id, 4);
-
-            // Move connections of old FusedConvolutionBatchNormalization to the fused node
-            if(conv_node->input_edge(5) != nullptr)
-            {
-                const auto bn_beta_id = conv_node->input_edge(5)->producer_id();
-                g.add_connection(bn_beta_id, 0, fused_id, 5);
-            }
-
-            if(conv_node->input_edge(6) != nullptr)
-            {
-                const auto bn_gamma_id = conv_node->input_edge(6)->producer_id();
-                g.add_connection(bn_gamma_id, 0, fused_id, 6);
-            }
-
-            // Adding the Element wise operand in case the post op is element wise operation
-            auto it = std::find_if(post_op_node_list.begin(),
-                                   post_op_node_list.end(),
-                                   [&](const INode * nd)
-            {
-                return (nd->type() == graph::NodeType::EltwiseLayer);
-            });
-
-            if(it != post_op_node_list.end())
-            {
-                g.add_connection(eltwise_operand_id, 0, fused_id, 7);
-            }
-
-            // Update fused node outputs
-            auto fused_node = g.node(fused_id);
-            fused_node->set_assigned_target(assigned_target);
-
-            auto conv_node_name = conv_node->name();
-
-            // collect the post ops names
-            std::string post_ops_name = "";
-            for(auto &post_op : post_op_node_list)
-            {
-                post_ops_name += post_op->name();
-            }
-            fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + post_ops_name, assigned_target });
-
-            // Fuse convolution with post op
-            fuse_convolution_with_post_op(g, fused_node, post_op_node_list, prev_op_dst_pos);
-
-            post_op_node_list.clear();
-            g.remove_node(conv_node->id());
-            ARM_COMPUTE_LOG_GRAPH_VERBOSE(std::endl);
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution node with post ops due to the presence of an output accessor\n");
-    }
-}
-
 template <typename N1, typename F, typename... Args>
 void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
 {
@@ -839,10 +402,6 @@
 
     detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
     detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
-    // The fusion of PostOps to ConvolutionLayer:
-    // It must occur after the fusion of PadLayer into ConvolutionLayer
-    // It must occur before the fusion of normal ActivationLayer into ConvolutionLayer as it takes precedence
-    detail::fuse_layer<ConvolutionLayerNode>(g, cl_target_prec, detail::fuse_convolution_with_post_ops, supported_fused_activations);
     detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
     detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
     detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
@@ -851,7 +410,6 @@
     // The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer. Because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with activation, if any
     detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
     detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
-    detail::fuse_layer<FusedConvolutionBatchNormalizationNode>(g, cl_target_prec, detail::fuse_convolution_batch_normalization_with_post_ops, supported_fused_activations);
 }
 } // namespace graph
 } // namespace arm_compute
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
deleted file mode 100644
index af81f03..0000000
--- a/src/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationWithPostOpsNode.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/graph/Utils.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-FusedConvolutionBatchNormalizationWithPostOpsNode::FusedConvolutionBatchNormalizationWithPostOpsNode(float epsilon, PadStrideInfo info,
-                                                                                                     unsigned int      num_groups,
-                                                                                                     ConvolutionMethod method,
-                                                                                                     FastMathHint      fast_math_hint)
-    : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint)
-{
-    _input_edges.resize(8, EmptyEdgeID);
-    _outputs.resize(1, NullTensorID);
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::set_convolution_method(ConvolutionMethod method)
-{
-    _method = method;
-}
-
-float FusedConvolutionBatchNormalizationWithPostOpsNode::epsilon() const
-{
-    return _epsilon;
-}
-
-ConvolutionMethod FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_method() const
-{
-    return _method;
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::set_fast_math_hint(FastMathHint hint)
-{
-    _fast_math_hint = hint;
-}
-
-FastMathHint FusedConvolutionBatchNormalizationWithPostOpsNode::fast_math_hint() const
-{
-    return _fast_math_hint;
-}
-
-PadStrideInfo FusedConvolutionBatchNormalizationWithPostOpsNode::convolution_info() const
-{
-    return _info;
-}
-
-unsigned int FusedConvolutionBatchNormalizationWithPostOpsNode::num_groups() const
-{
-    return _num_groups;
-}
-
-TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                                                              const TensorDescriptor &weights_descriptor,
-                                                                                              const PadStrideInfo    &info)
-{
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-
-    const unsigned int input_width   = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
-    const unsigned int input_height  = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
-    const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
-    const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
-
-    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
-
-    const DataLayout data_layout       = input_descriptor.layout;
-    TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
-
-    return output_descriptor;
-}
-
-bool FusedConvolutionBatchNormalizationWithPostOpsNode::forward_descriptors()
-{
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
-    {
-        Tensor *dst = output(0);
-        ARM_COMPUTE_ERROR_ON(dst == nullptr);
-        dst->desc() = configure_output(0);
-        return true;
-    }
-    return false;
-}
-
-TensorDescriptor FusedConvolutionBatchNormalizationWithPostOpsNode::configure_output(size_t idx) const
-{
-    ARM_COMPUTE_UNUSED(idx);
-    const Tensor *src     = input(0);
-    const Tensor *weights = input(1);
-
-    ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
-
-    TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
-
-    return output_info;
-}
-
-NodeType FusedConvolutionBatchNormalizationWithPostOpsNode::type() const
-{
-    return FusedConvolutionBatchNormalizationWithPostOpsNode::node_type;
-}
-
-void FusedConvolutionBatchNormalizationWithPostOpsNode::accept(INodeVisitor &v)
-{
-    v.visit(*this);
-}
-} // namespace graph
-} // namespace arm_compute
diff --git a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp b/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp
deleted file mode 100644
index 63341e2..0000000
--- a/src/graph/nodes/FusedConvolutionWithPostOpNode.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FusedConvolutionWithPostOpNode.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/graph/Utils.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-FusedConvolutionWithPostOpNode::FusedConvolutionWithPostOpNode(PadStrideInfo     info,
-                                                               unsigned int      num_groups,
-                                                               ConvolutionMethod method,
-                                                               FastMathHint      fast_math_hint,
-                                                               QuantizationInfo  out_quant_info)
-    : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation()
-{
-    _input_edges.resize(4, EmptyEdgeID);
-    _outputs.resize(1, NullTensorID);
-}
-
-void FusedConvolutionWithPostOpNode::set_convolution_method(ConvolutionMethod method)
-{
-    _method = method;
-}
-
-ConvolutionMethod FusedConvolutionWithPostOpNode::convolution_method() const
-{
-    return _method;
-}
-
-void FusedConvolutionWithPostOpNode::set_fast_math_hint(FastMathHint hint)
-{
-    _fast_math_hint = hint;
-}
-
-FastMathHint FusedConvolutionWithPostOpNode::fast_math_hint() const
-{
-    return _fast_math_hint;
-}
-
-PadStrideInfo FusedConvolutionWithPostOpNode::convolution_info() const
-{
-    return _info;
-}
-
-unsigned int FusedConvolutionWithPostOpNode::num_groups() const
-{
-    return _num_groups;
-}
-
-ActivationLayerInfo FusedConvolutionWithPostOpNode::fused_activation() const
-{
-    return _fused_activation;
-}
-
-void FusedConvolutionWithPostOpNode::set_fused_activation(ActivationLayerInfo fused_activation)
-{
-    _fused_activation = fused_activation;
-}
-
-void FusedConvolutionWithPostOpNode::set_convolution_info(PadStrideInfo info)
-{
-    _info = info;
-}
-
-TensorDescriptor FusedConvolutionWithPostOpNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                                           const TensorDescriptor &weights_descriptor,
-                                                                           const PadStrideInfo    &info)
-{
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-
-    const unsigned int input_width   = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
-    const unsigned int input_height  = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
-    const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
-    const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
-
-    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
-
-    const DataLayout data_layout       = input_descriptor.layout;
-    TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
-
-    return output_descriptor;
-}
-
-bool FusedConvolutionWithPostOpNode::forward_descriptors()
-{
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
-    {
-        Tensor *dst = output(0);
-        ARM_COMPUTE_ERROR_ON(dst == nullptr);
-        dst->desc() = configure_output(0);
-        return true;
-    }
-    return false;
-}
-
-TensorDescriptor FusedConvolutionWithPostOpNode::configure_output(size_t idx) const
-{
-    ARM_COMPUTE_UNUSED(idx);
-    const Tensor *src     = input(0);
-    const Tensor *weights = input(1);
-
-    ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
-
-    TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
-    if(!_out_quant_info.empty())
-    {
-        output_info.quant_info = _out_quant_info;
-    }
-
-    return output_info;
-}
-
-NodeType FusedConvolutionWithPostOpNode::type() const
-{
-    return FusedConvolutionWithPostOpNode::node_type;
-}
-
-void FusedConvolutionWithPostOpNode::accept(INodeVisitor &v)
-{
-    v.visit(*this);
-}
-} // namespace graph
-} // namespace arm_compute
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index 1071d50..9c7c424 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -85,22 +85,6 @@
     _info = ss.str();
 }
 
-void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationWithPostOpsNode &n)
-{
-    ARM_COMPUTE_UNUSED(n);
-    std::stringstream ss;
-    ss << "FusedConvolutionBatchNormalizationWithPostOpsNode";
-    _info = ss.str();
-}
-
-void DotGraphVisitor::visit(FusedConvolutionWithPostOpNode &n)
-{
-    ARM_COMPUTE_UNUSED(n);
-    std::stringstream ss;
-    ss << "FusedConvolutionWithPostOpNode";
-    _info = ss.str();
-}
-
 void DotGraphVisitor::visit(FusedDepthwiseConvolutionBatchNormalizationNode &n)
 {
     ARM_COMPUTE_UNUSED(n);
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 476bf27..f3c05ad 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -29,7 +29,6 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
 #include "src/core/CL/ICLKernel.h"
-#include "src/core/experimental/PostOpUtils.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClConv2d.h"
 
@@ -61,26 +60,21 @@
 CLConvolutionLayer::~CLConvolutionLayer() = default;
 
 void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
 }
 
 void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
                                    const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
                                                             enable_fast_math, num_groups));
-    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
 
-    // Convert post op arguments to ITensorInfo
-    auto transformed_post_ops = experimental::transform_post_op_list_arguments<ICLTensor *, ITensorInfo *>(post_ops, [](auto tensor)
-    {
-        return tensor->info();
-    });
-    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, transformed_post_ops);
+    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
 
     switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
                                                     weights_info, CLScheduler::get().target()))
@@ -97,7 +91,6 @@
         }
         case ConvolutionMethod::FFT:
         {
-            ARM_COMPUTE_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops");
             auto f = std::make_unique<CLFFTConvolutionLayer>(_impl->memory_manager);
             f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
             _impl->func = std::move(f);
@@ -110,31 +103,23 @@
 
     if(_impl->op)
     {
-        _impl->memory_group         = MemoryGroup(std::move(_impl->memory_manager));
-        _impl->aux_mem_req          = _impl->op->workspace();
-        _impl->run_pack             = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-        size_t post_op_tensor_index = 0;
-        for(const auto &op : post_ops.get_list())
-        {
-            for(auto &tensor : op->arguments())
-            {
-                _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor);
-            }
-        }
-        _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
-        _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+        _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+        _impl->aux_mem_req  = _impl->op->workspace();
+        _impl->run_pack     = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
+        _impl->prep_pack    = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
+        _impl->workspace    = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
     }
 }
 
 Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups, const experimental::PostOpList<ITensorInfo *> &post_ops)
+                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
 
     const GPUTarget  gpu_target  = CLScheduler::get().target();
-    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups, post_ops);
+    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
 
     switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
     {
@@ -149,7 +134,6 @@
         case ConvolutionMethod::FFT:
         {
             // Validate FFT-based convolution layer
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(post_ops.size() > 0, "CLFFTConvolutionLayer does not support post ops");
             ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math));
             break;
         }
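
With the PostOpList parameter dropped from CLConvolutionLayer, an activation is the only operation still fused at this level, and it is requested through act_info. Below is a minimal sketch of the updated configure() call, assuming hypothetical src/weights/biases/dst tensors whose TensorInfo setup and allocation are omitted:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

    using namespace arm_compute;

    void configure_conv_without_post_ops()
    {
        // Hypothetical tensors; initialisation and allocation are omitted for brevity.
        CLTensor src, weights, biases, dst;

        CLConvolutionLayer conv;
        // The trailing PostOpList argument no longer exists; a fused activation goes via act_info.
        conv.configure(&src, &weights, &biases, &dst,
                       PadStrideInfo(1, 1, 0, 0),                                          // conv_info
                       WeightsInfo(),                                                      // weights_info
                       Size2D(1U, 1U),                                                     // dilation
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), // act_info
                       false,                                                              // enable_fast_math
                       1U);                                                                // num_groups
    }
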
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index ad5bfd8..c8c18f3 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,6 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/experimental/PostOpUtils.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClGemmConv2d.h"
 #include "support/Cast.h"
@@ -69,24 +68,19 @@
 CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
 
 void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                       const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+                                       const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups, post_ops);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
 }
 
 void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
                                        const PadStrideInfo &conv_info,
-                                       const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ICLTensor *> &post_ops)
+                                       const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    _impl->weights = weights;
-    _impl->op      = std::make_unique<opencl::ClGemmConv2d>();
-    // Convert post op arguments to ITensorInfo
-    auto transformed_post_ops = experimental::transform_post_op_list_arguments<ICLTensor *, ITensorInfo *>(post_ops, [](auto tensor)
-    {
-        return tensor->info();
-    });
-    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, transformed_post_ops);
+    _impl->weights               = weights;
+    _impl->op                    = std::make_unique<opencl::ClGemmConv2d>();
+    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
     _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
 
     _impl->run_pack =
@@ -96,15 +90,6 @@
         { TensorType::ACL_SRC_2, biases },
         { TensorType::ACL_DST, output }
     };
-    // Add post op tensors
-    size_t post_op_tensor_index = 0;
-    for(const auto &op : post_ops.get_list())
-    {
-        for(auto &tensor : op->arguments())
-        {
-            _impl->run_pack.add_const_tensor(experimental::get_post_op_arg_type(post_op_tensor_index++), *tensor);
-        }
-    }
     _impl->prep_pack =
     {
         { TensorType::ACL_SRC_1, weights },
@@ -115,9 +100,9 @@
 }
 
 Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups, const experimental::PostOpList<ITensorInfo *> &post_ops)
+                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
-    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups, post_ops);
+    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
     return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info);
 }
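
[Editor's note, illustration only - not part of the patch] A hedged usage sketch of the simplified CLGEMMConvolutionLayer::configure() above. Shapes, the RELU activation and the default CLScheduler initialisation are example assumptions; a fused activation is still requested through act_info rather than through a PostOpList.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // set up the OpenCL context and queue

    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 14U, 12U, 2U), 1, DataType::F32, DataLayout::NHWC));
    weights.allocator()->init(TensorInfo(TensorShape(16U, 1U, 1U, 24U), 1, DataType::F32, DataLayout::NHWC));
    biases.allocator()->init(TensorInfo(TensorShape(24U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(24U, 14U, 12U, 2U), 1, DataType::F32, DataLayout::NHWC));

    // No trailing PostOpList parameter; the fused activation goes through act_info.
    CLGEMMConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 0, 0), WeightsInfo(), Size2D(1U, 1U),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), 1U /* num_groups */);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run();
    CLScheduler::get().sync();
    return 0;
}
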
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c4b12e7..3f22235 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -67,7 +67,6 @@
           validation/reference/L2NormalizeLayer.cpp
           validation/reference/ActivationLayer.cpp
           validation/reference/SpaceToBatch.cpp
-          validation/reference/PostOps.cpp
           validation/reference/Im2Col.cpp
           validation/reference/DequantizationLayer.cpp
           validation/reference/DeconvolutionLayer.cpp
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index bced540..986d767 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/PostOps.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
@@ -69,53 +68,30 @@
 const auto CNNDataTypes = framework::dataset::make("DataType",
 {
     DataType::F16,
-    DataType::F32,
-    DataType::QASYMM8,
-    DataType::QASYMM8_SIGNED,
+             DataType::F32,
+             DataType::QASYMM8,
+             DataType::QASYMM8_SIGNED,
 });
 
 /** Grouped CNN data types */
 const auto GroupedCNNDataTypes = framework::dataset::make("DataType",
 {
     DataType::F16,
-    DataType::F32
+             DataType::F32
 });
 
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsDataset      = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
 });
 const auto ActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
 });
-
-bool is_post_op_list_valid_in_gemmconv(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &output_shape, DataType data_type, DataLayout data_layout,
-                                       const PadStrideInfo &conv_info, const experimental::PostOpList<ITensorInfo *> &post_ops)
-{
-    const int idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
-    const auto         dilation   = Size2D(1U, 1U);
-    const unsigned int num_groups = 1U;
-
-    TensorInfo input_info(input_shape, 1, data_type, data_layout);
-    TensorInfo weights_info(weights_shape, 1, data_type, data_layout);
-
-    TensorInfo output_info(output_shape, 1, data_type, data_layout);
-
-    WeightsInfo w_info(false, weights_info.dimension(idx_width), weights_info.dimension(idx_height), weights_info.dimension(idx_kernels));
-
-    const auto status = CLGEMMConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true),
-                                                         &weights_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true),
-                                                         conv_info, w_info, dilation, ActivationLayerInfo(), num_groups, post_ops);
-    return bool(status);
-}
 } // namespace
 
 TEST_SUITE(CL)
@@ -207,72 +183,6 @@
                                                                             enable_fast_math);
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
-
-DATA_TEST_CASE(ValidatePostOpSupportInConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-                                          framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 17U, 31U), 1, DataType::F32, DataLayout::NHWC),            // Select GEMM
-                                                                                  TensorInfo(TensorShape(17U, 31U, 32U), 1, DataType::F32, DataLayout::NCHW),           // Select WINOGRAD
-                                                                                  TensorInfo(TensorShape(27U, 27U, 48U), 1, DataType::F32, DataLayout::NCHW),           // Select Direct
-                                                                                  TensorInfo(TensorShape(27U, 27U, 48U), 1, DataType::F32, DataLayout::NCHW),           // Select FFT
-                                          }),
-                                          framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(2U, 1U, 1U, 19U), 1, DataType::F32, DataLayout::NHWC),
-                                                                                    TensorInfo(TensorShape(5U, 5U, 32U, 19U), 1, DataType::F32, DataLayout::NCHW),
-                                                                                    TensorInfo(TensorShape(5U, 5U, 48U, 128U), 1, DataType::F32, DataLayout::NCHW),
-                                                                                    TensorInfo(TensorShape(11U, 11U, 48U, 24), 1, DataType::F32, DataLayout::NCHW),
-                                          })),
-                                          framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(19U, 17U, 31U), 1, DataType::F32, DataLayout::NHWC),
-                                                                                   TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, DataLayout::NCHW),
-                                                                                   TensorInfo(TensorShape(27U, 27U, 128U), 1, DataType::F32, DataLayout::NCHW),
-                                                                                   TensorInfo(TensorShape(27U, 27U, 24U), 1, DataType::F32, DataLayout::NCHW),
-                                          })),
-                                          framework::dataset::make("ConvInfo", { PadStrideInfo(1U, 1U, 0U, 0U),
-                                                                                 PadStrideInfo(1U, 1U, 2U, 2U),
-                                                                                 PadStrideInfo(1U, 1U, 2U, 2U),
-                                                                                 PadStrideInfo(1U, 1U, 5U, 5U),
-                                          })),
-                                         framework::dataset::make("EnableFastMath", { false, true, false, false})),
-                                         framework::dataset::make("ExpectedMethod",{ ConvolutionMethod::GEMM,
-                                                                                     ConvolutionMethod::WINOGRAD,
-                                                                                     ConvolutionMethod::DIRECT,
-                                                                                     ConvolutionMethod::FFT,
-                                         })),
-                                         framework::dataset::make("PostOpSupported",{ true, false, false, false
-                                         })),
-                                         input_info, weights_info, output_info, conv_info, enable_fast_math, expected_method, post_op_supported)
-{
-    const int idx_width  = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::WIDTH);
-    const int idx_height = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::HEIGHT);
-    const int idx_kernels = get_data_layout_dimension_index(input_info.data_layout(), DataLayoutDimension::BATCHES);
-
-    const auto dilation = Size2D(1U, 1U);
-    const unsigned int num_groups = 1U;
-
-    WeightsInfo w_info(false, weights_info.dimension(idx_width), weights_info.dimension(idx_height), weights_info.dimension(idx_kernels));
-
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<ITensorInfo*>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-
-    ConvolutionMethod actual_method = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
-                                                                            &weights_info.clone()->set_is_resizable(true),
-                                                                            &output_info.clone()->set_is_resizable(true), conv_info,
-                                                                            WeightsInfo(),
-                                                                            ActivationLayerInfo(),
-                                                                            GPUTarget::BIFROST,
-                                                                            dilation,
-                                                                            enable_fast_math);
-    ARM_COMPUTE_EXPECT(actual_method == expected_method, framework::LogLevel::ERRORS);
-    const auto is_valid = CLConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true),
-                                                                            &weights_info.clone()->set_is_resizable(true),
-                                                                            nullptr,
-                                                                            &output_info.clone()->set_is_resizable(true),
-                                                                            conv_info,
-                                                                            w_info,
-                                                                            dilation,
-                                                                            ActivationLayerInfo(),
-                                                                            enable_fast_math,
-                                                                            num_groups,
-                                                                            post_ops);
-    ARM_COMPUTE_EXPECT( bool(is_valid) == post_op_supported, framework::LogLevel::ERRORS);
-}
 // clang-format on
 // *INDENT-ON*
 TEST_SUITE_END() // ConvolutionLayer
@@ -285,167 +195,11 @@
 template <typename T>
 using CLConvolutionValidationWithPaddingFixture = ConvolutionValidationWithPaddingFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
 
-TEST_SUITE(ValidateFusedPostOpsConfigs)
-TEST_SUITE(Invalid)
-TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
-{
-    const auto data_type     = DataType::F32;
-    const auto data_layout   = DataLayout::NHWC;
-    const auto conv_info     = PadStrideInfo(1, 1, 0, 0);
-    const auto input_shape   = TensorShape(16U, 14U, 12U, 2U);
-    const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
-    const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
-    const TensorShape post_op_arg0_shape(output_shape);
-    TensorInfo        post_op_arg_info(post_op_arg0_shape, 1, data_type);
-    auto              post_op_arg1_info = post_op_arg_info.clone();
-
-    // Unsupported sequence of post ops
-    experimental::PostOpList<ITensorInfo *> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
-                                                                          &post_op_arg_info,
-                                                                          1,
-                                                                          ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
-                                                                          post_op_arg1_info.get(),
-                                                                          0,
-                                                                          ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OnlyNHWCIsSupported, framework::DatasetMode::ALL)
-{
-    const auto data_type     = DataType::F32;
-    const auto data_layout   = DataLayout::NCHW;
-    const auto conv_info     = PadStrideInfo(1, 1, 0, 0);
-    const auto input_shape   = TensorShape(14U, 12U, 16U, 2U);
-    const auto weights_shape = TensorShape(1U, 1U, 16U, 24U);
-
-    const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
-    const TensorShape post_op_arg0_shape(output_shape);
-    TensorInfo        post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
-    experimental::PostOpList<ITensorInfo *> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
-                                                                          &post_op_arg_info,
-                                                                          1,
-                                                                          ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OnlyFloatingTypeIsSupported, framework::DatasetMode::ALL)
-{
-    const auto data_type     = DataType::QASYMM8;
-    const auto data_layout   = DataLayout::NHWC;
-    const auto conv_info     = PadStrideInfo(1, 1, 0, 0);
-    const auto input_shape   = TensorShape(16U, 14U, 12U, 2U);
-    const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
-    const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
-    const TensorShape post_op_arg0_shape(output_shape);
-    TensorInfo        post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
-    experimental::PostOpList<ITensorInfo *> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
-                                                                          &post_op_arg_info,
-                                                                          1,
-                                                                          ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OnlyConv1x1Stride1IsSupported_UnsupportedKernelSize, framework::DatasetMode::ALL)
-{
-    const auto data_type     = DataType::F32;
-    const auto data_layout   = DataLayout::NHWC;
-    const auto conv_info     = PadStrideInfo(1, 1, 0, 0);
-    const auto input_shape   = TensorShape(16U, 14U, 12U, 2U);
-    const auto weights_shape = TensorShape(16U, 3U, 3U, 24U);
-
-    const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
-    const TensorShape post_op_arg0_shape(output_shape);
-    TensorInfo        post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
-    experimental::PostOpList<ITensorInfo *> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
-                                                                          &post_op_arg_info,
-                                                                          1,
-                                                                          ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OnlyConv1x1Stride1IsSupported_UnsupportedStride, framework::DatasetMode::ALL)
-{
-    const auto data_type     = DataType::F32;
-    const auto data_layout   = DataLayout::NHWC;
-    const auto conv_info     = PadStrideInfo(3, 3, 0, 0);
-    const auto input_shape   = TensorShape(16U, 14U, 12U, 2U);
-    const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
-    const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
-    const TensorShape post_op_arg0_shape(output_shape);
-    TensorInfo        post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
-    experimental::PostOpList<ITensorInfo *> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
-                                                                          &post_op_arg_info,
-                                                                          1,
-                                                                          ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-TEST_SUITE(Valid)
-TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
-{
-    const auto data_type     = DataType::F32;
-    const auto data_layout   = DataLayout::NHWC;
-    const auto conv_info     = PadStrideInfo(1, 1, 0, 0);
-    const auto input_shape   = TensorShape(16U, 14U, 12U, 2U);
-    const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
-    const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
-    experimental::PostOpList<ITensorInfo *> post_ops{};
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(SupportedPostOps, framework::DatasetMode::ALL)
-{
-    const auto data_type     = DataType::F32;
-    const auto data_layout   = DataLayout::NHWC;
-    const auto conv_info     = PadStrideInfo(1, 1, 0, 0);
-    const auto input_shape   = TensorShape(16U, 14U, 12U, 2U);
-    const auto weights_shape = TensorShape(16U, 1U, 1U, 24U);
-
-    const auto output_shape = misc::shape_calculator::compute_deep_convolution_shape(input_shape, data_layout, weights_shape, conv_info);
-
-    TensorShape post_op_arg0_shape(output_shape);
-    post_op_arg0_shape[1] = 1; // Broadcast in "Y" (second) dimension
-    TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-
-    experimental::PostOpList<ITensorInfo *> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(
-                                                                          &post_op_arg_info,
-                                                                          1,
-                                                                          ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid_in_gemmconv(input_shape, weights_shape, output_shape, data_type, data_layout, conv_info, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Valid
-TEST_SUITE_END() // ValidateFusedPostOps
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                   framework::dataset::make("DataType",
-                                                                                                                           DataType::F16)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                   framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                            ActivationFunctionsSmallDataset))
 {
     // Validate output
@@ -456,10 +210,7 @@
 TEST_SUITE(FP32)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                    framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::F32)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                    framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                             ActivationFunctionsSmallDataset))
 {
     // Validate output
@@ -503,16 +254,16 @@
 template <typename T>
 using CLGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T, int8_t>;
 
-const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto QuantizedActivationFunctionsDataset      = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
 });
 const auto QuantizedActivationFunctionsSmallDataset = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
 });
 
 TEST_SUITE(Quantized)
@@ -520,8 +271,8 @@
 const auto QuantizationData = framework::dataset::make("QuantizationInfo",
 {
     QuantizationInfo(0.5f, 10),
-    QuantizationInfo(0.3f, 3),
-    QuantizationInfo(1.1f, 10),
+                     QuantizationInfo(0.3f, 3),
+                     QuantizationInfo(1.1f, 10),
 });
 TEST_SUITE(QASYMM8)
 
@@ -637,9 +388,7 @@
 TEST_SUITE(FP32)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                   framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                    ActivationFunctionsSmallDataset))
 {
     // Validate output
@@ -661,9 +410,7 @@
 TEST_SUITE(FP16)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMGroupedConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallGroupedConvolutionLayerDataset(),
-                                                                                                                  framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                  framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                  framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                   ActivationFunctionsSmallDataset))
 {
     // Validate output
diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
index 7f63a03..0ddf437 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,11 +53,6 @@
 template <typename T>
 using CLGEMMMatrixMultiplyNativeFixture = GEMMMatrixMultiplyNativeValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
 
-// Fixture for CLGEMMMatrixMultiplyNative with post ops
-template <typename T>
-using CLGEMMMatrixMultiplyNativeWithPostOpsFixture =
-    GEMMMatrixMultiplyNativeWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
-
 // Fixture for CLGEMMMatrixMultiplyNative3D
 template <typename T>
 using CLGEMMMatrixMultiplyNative3DFixture = GEMMMatrixMultiplyNative3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
@@ -146,105 +141,6 @@
                                     broadcast_bias_values),
                                     framework::dataset::make("Activation", ActivationLayerInfo()));
 
-/** Post Ops */
-using PostOpArgBroadcast =  CLGEMMMatrixMultiplyNativeWithPostOpsFixture<float>::PostOpArgBroadcast;
-experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(true, true, false),   // If broadcast in dims 0, 1 and 2
-        0,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_2()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(false, true, true),   // If broadcast in dims 0, 1 and 2
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_3()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    // post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, false),  // If broadcast in dims 0, 1 and 2
-        1,
-        ConvertPolicy::SATURATE);
-    return post_ops;
-}
-// To test that the output of the main op is the first parameter in prelu post op
-experimental::PostOpList<PostOpArgBroadcast> post_ops_4()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, true),   // If true, broadcast in corresponding dim: 0, 1 or 2
-        0,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-// To test that the output of the main op is the second parameter in prelu post op i.e. it is the alpha_param
-experimental::PostOpList<PostOpArgBroadcast> post_ops_5()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, false),   // If true, broadcast in corresponding dim: 0, 1 or 2
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-/** Different Post Op Lists */
-const auto post_op_lists = framework::dataset::make("post_op_lists", {
-    post_ops_1(),
-    post_ops_2(),
-    post_ops_3(),
-    post_ops_4(),
-    post_ops_5()
- } );
-
-bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList<ITensorInfo*>& post_ops)
-{
-    const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true);
-    const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false);
-
-    // Create TensorInfo for post op arguments
-    TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
-    TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
-    TensorInfo input2_info(TensorShape(n), 1, data_type);
-    TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);
-
-    GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
-             false /**< reinterpret the input as 3D */,
-             true  /**< Flag used to broadcast the bias addition */,
-             false /**< wider accumm */,
-             false /**< has pad y */,
-           ActivationLayerInfo::ActivationFunction::IDENTITY,
-             1   /**< Multiplication factor for the width of the 1xW transposed block */,
-             1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
-             lhs_info,
-             rhs_info,
-             0  /**< Offset to be added to each element of the matrix A */,
-             0 /**< Offset to be added to each element of the matrix B */,
-             post_ops);
-    return bool(ClGemmMatrixMultiplyNativeKernel::validate(&input0_info.clone()->set_is_resizable(true),
-                                                          &input1_info.clone()->set_is_resizable(true),
-                                                          &input2_info.clone()->set_is_resizable(true),
-                                                          &output_info.clone()->set_is_resizable(true),1.f,1.f,
-                                                          lhs_info,
-                                                          rhs_info,
-                                                          gemm_info));
-}
-
 /** Configuration test */
 void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info)
 {
@@ -295,119 +191,6 @@
 
 TEST_SUITE(CL)
 TEST_SUITE(GEMMMatrixMultiplyNative)
-TEST_SUITE(ValidateFusedPostOpsConfigs)
-TEST_SUITE(Invalid)
-TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 17;
-    const unsigned int n = 1;
-    const unsigned int k = 13;
-    const unsigned int batch = 2;
-    TensorShape post_op_arg0_shape(n, m, batch);
-    TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-    auto post_op_arg1_info = post_op_arg_info.clone();
-
-    // Unsupported sequence of post ops
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
-        &post_op_arg_info,
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
-        post_op_arg1_info.get(),
-        0,
-        ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OutputWidened, framework::DatasetMode::ALL)
-{
-    // Invalid broadcast: post op tensors "widen" the output tensor
-    const auto data_type = DataType::F32;
-    const unsigned int m = 1;
-    const unsigned int n = 18;
-    const unsigned int k = 13;
-    const unsigned int batch = 2;
-    TensorShape post_op_arg_shape(n, m + 1, batch); // output's Y dimension (m) is "widened", which is not allowed
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL)
-{
-    // Invalid broadcast: post op tensors broadcast in the first dimension (X) only
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, m, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-TEST_SUITE(Valid)
-TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(n, 1, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, 1, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, 1, 1);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Valid
-TEST_SUITE_END() // ValidateFusedPostOps
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
@@ -541,31 +324,6 @@
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeWithPostOpsFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_values,
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   framework::dataset::make("M0", { 4 })),
-                                                                   n0_values_precommit),
-                                                                   k0_values_precommit),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                   framework::dataset::make("alpha", {1.0f} )),
-                                                                   framework::dataset::make("beta", {1.0f} )),
-                                                                   framework::dataset::make("broadcast_bias", { false, true } )),
-                                                                   framework::dataset::make("Activation", { ActivationLayerInfo() })),
-                                                                   post_op_lists)
-                                                                   )
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-}
-
-TEST_SUITE_END() //  FusedPostOps
-
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // GEMMMatrixMulipltyNative
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
index cdd8967..b06e4bf 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/PostOps.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
@@ -62,21 +61,11 @@
 template <typename T>
 using CLGEMMMatrixMultiplyReshapedFixture = GEMMMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
 
-// Fixture for CLGEMMMatrixMultiplyReshaped with post ops
-template <typename T>
-using CLGEMMMatrixMultiplyReshapedWithPostOpsFixture =
-    GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
-
 // Fixture for CLGEMMMatrixMultiplyReshaped mixed precision
 template <typename T>
 using CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture =
     GEMMMatrixMultiplyReshapedValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped, true>;
 
-// Fixture for CLGEMMMatrixMultiplyReshaped mixed precision with post ops
-template <typename T>
-using CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture =
-    GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped, true>;
-
 // Fixture for CLGEMMMatrixMultiplyReshaped3D
 template <typename T>
 using CLGEMMMatrixMultiplyReshaped3DFixture = GEMMMatrixMultiplyReshaped3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeLHSMatrix, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshaped>;
@@ -184,108 +173,6 @@
 /** LHS transposed values */
 const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } );
 
-/** Post Ops */
-using PostOpArgBroadcast =  CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>::PostOpArgBroadcast;
-experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(true, true, false),   // If broadcast in dims 0, 1 and 2
-        0,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_2()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(false, true, true),   // If broadcast in dims 0, 1 and 2
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_3()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, true),  // If broadcast in dims 0, 1 and 2
-        1,
-        ConvertPolicy::SATURATE);
-    return post_ops;
-}
-// To test that the output of the main op is the first parameter in prelu post op
-experimental::PostOpList<PostOpArgBroadcast> post_ops_4()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, true),   // If true, broadcast in corresponding dim: 0, 1 or 2
-        0,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-// To test that the output of the main op is the second parameter in prelu post op i.e. it is the alpha_param
-experimental::PostOpList<PostOpArgBroadcast> post_ops_5()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, false),   // If true, broadcast in corresponding dim: 0, 1 or 2
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-/** Different Post Op Lists */
-const auto post_op_lists = framework::dataset::make("post_op_lists", {
-    post_ops_1(),
-    post_ops_2(),
-    post_ops_3(),
-    post_ops_4(),
-    post_ops_5()
- } );
-
-bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList<ITensorInfo*>& post_ops)
-{
-    const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true);
-    const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false);
-
-    // Create TensorInfo for post op arguments
-    TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
-    TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
-    TensorInfo input2_info(TensorShape(n), 1, data_type);
-    TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);
-
-    const TensorInfo reshaped_input0_info = input0_info.clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(input0_info, lhs_info));
-    const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info));
-
-    GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
-             false /**< reinterpret the input as 3D */,
-             true  /**< Flag used to broadcast the bias addition */,
-             false /**< wider accumm */,
-             false /**< has pad y */,
-           ActivationLayerInfo::ActivationFunction::IDENTITY,
-             1   /**< Multiplication factor for the width of the 1xW transposed block */,
-             1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
-             lhs_info,
-             rhs_info,
-             0  /**< Offset to be added to each element of the matrix A */,
-             0 /**< Offset to be added to each element of the matrix B */,
-             post_ops);
-    return bool(ClGemmMatrixMultiplyReshapedKernel::validate(&reshaped_input0_info.clone()->set_is_resizable(true),
-                                                          &reshaped_input1_info.clone()->set_is_resizable(true),
-                                                          &input2_info.clone()->set_is_resizable(true),
-                                                          &output_info.clone()->set_is_resizable(true),1.f,1.f,
-                                                          lhs_info,
-                                                          rhs_info,
-                                                          gemm_info));
-}
-
 } // namespace
 
 TEST_SUITE(CL)
@@ -450,119 +337,7 @@
                                                           rhs_info,
                                                           gemm_info)) == expected, framework::LogLevel::ERRORS);
 }
-TEST_SUITE(ValidateFusedPostOpsConfigs)
-TEST_SUITE(Invalid)
-TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 17;
-    const unsigned int n = 1;
-    const unsigned int k = 13;
-    const unsigned int batch = 2;
-    TensorShape post_op_arg0_shape(n, m, batch);
-    TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-    auto post_op_arg1_info = post_op_arg_info.clone();
 
-    // Unsupported sequence of post ops
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
-        &post_op_arg_info,
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
-        post_op_arg1_info.get(),
-        0,
-        ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OutputWidened, framework::DatasetMode::ALL)
-{
-    // Invalid broadcast: post op tensors "widen" the output tensor
-    const auto data_type = DataType::F32;
-    const unsigned int m = 17;
-    const unsigned int n = 1;
-    const unsigned int k = 13;
-    const unsigned int batch = 2;
-    TensorShape post_op_arg_shape(n + 4, m, batch); // output's X dimension (n) is "widened", which is not allowed
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL)
-{
-    // Invalid broadcast: post op tensors broadcast in the first dimension (X) only
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, m, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-TEST_SUITE(Valid)
-TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(n, 1, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, 1, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, 1, 1);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Valid
-TEST_SUITE_END() // ValidateFusedPostOps
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
@@ -697,44 +472,6 @@
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_values,
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   m0_values_precommit),
-                                                                   n0_values_precommit),
-                                                                   k0_values_precommit),
-                                                                   v0_values_precommit),
-                                                                   h0_values_precommit),
-                                                                   framework::dataset::make("interleave_lhs", { false })),
-                                                                   framework::dataset::make("interleave_rhs", { false })),
-                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                   a_values_precommit),
-                                                                   beta_values_precommit),
-                                                                   framework::dataset::make("broadcast_bias", { true } )),
-                                                                   lhs_transpose_values),
-                                                                   act_values),
-                                                                   post_op_lists)
-                                                                   )
-{
-    // Validate output
-    if(validate_result)
-    {
-        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-    }
-    else
-    {
-        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
-        framework::ARM_COMPUTE_PRINT_INFO();
-    }
-}
-
-TEST_SUITE_END() //  FusedPostOps
 
 TEST_SUITE(ExportToCLImage)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
@@ -1002,44 +739,6 @@
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_values,
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   m0_values_precommit),
-                                                                   n0_values_precommit),
-                                                                   k0_values_precommit),
-                                                                   v0_values_precommit),
-                                                                   h0_values_precommit),
-                                                                   framework::dataset::make("interleave_lhs", { false })),
-                                                                   framework::dataset::make("interleave_rhs", { false })),
-                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                   a_values_precommit),
-                                                                   beta_values_precommit),
-                                                                   framework::dataset::make("broadcast_bias", { true } )),
-                                                                   lhs_transpose_values),
-                                                                   act_values),
-                                                                   post_op_lists)
-                                                                   )
-{
-    // Validate output only if validate() is successful
-    if(validate_result)
-    {
-        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-    }
-    else
-    {
-        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
-        framework::ARM_COMPUTE_PRINT_INFO();
-    }
-}
-
-TEST_SUITE_END() //  FusedPostOps
 
 TEST_SUITE_END() // ExportToCLImage
 TEST_SUITE_END() // FP32
@@ -1178,45 +877,6 @@
     }
 }
 
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_values,
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   m0_values_precommit),
-                                                                   n0_values_precommit),
-                                                                   k0_values_precommit),
-                                                                   v0_values_precommit),
-                                                                   h0_values_precommit),
-                                                                   framework::dataset::make("interleave_lhs", { false })),
-                                                                   framework::dataset::make("interleave_rhs", { false })),
-                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
-                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                   a_values_precommit),
-                                                                   beta_values_precommit),
-                                                                   framework::dataset::make("broadcast_bias", { true } )),
-                                                                   lhs_transpose_values),
-                                                                   act_values),
-                                                                   post_op_lists)
-                                                                   )
-{
-    // Validate output
-    if(validate_result)
-    {
-        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
-    }
-    else
-    {
-        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
-        framework::ARM_COMPUTE_PRINT_INFO();
-    }
-}
-
-TEST_SUITE_END() //  FusedPostOps
-
 TEST_SUITE(ExportToCLImage)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
                framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16),  // OK or incorrect if cl_khr_image2d_from_buffer not supported
@@ -1483,44 +1143,6 @@
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_values,
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   m0_values_precommit),
-                                                                   n0_values_precommit),
-                                                                   k0_values_precommit),
-                                                                   v0_values_precommit),
-                                                                   h0_values_precommit),
-                                                                   framework::dataset::make("interleave_lhs", { false })),
-                                                                   framework::dataset::make("interleave_rhs", { false })),
-                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
-                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                   a_values_precommit),
-                                                                   beta_values_precommit),
-                                                                   framework::dataset::make("broadcast_bias", { true } )),
-                                                                   lhs_transpose_values),
-                                                                   act_values),
-                                                                   post_op_lists)
-                                                                   )
-{
-    // Validate output only if validate() is successful
-    if(validate_result)
-    {
-        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
-    }
-    else
-    {
-        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
-        framework::ARM_COMPUTE_PRINT_INFO();
-    }
-}
-
-TEST_SUITE_END() //  FusedPostOps
 
 TEST_SUITE_END() // ExportToCLImage
 TEST_SUITE_END() // FP16
@@ -1659,45 +1281,6 @@
     }
 }
 
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_values,
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   m0_values_precommit),
-                                                                   n0_values_precommit),
-                                                                   k0_values_precommit),
-                                                                   v0_values_precommit),
-                                                                   h0_values_precommit),
-                                                                   framework::dataset::make("interleave_lhs", { false })),
-                                                                   framework::dataset::make("interleave_rhs", { false })),
-                                                                   framework::dataset::make("export_to_cl_image_rhs", { true, false })),
-                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                   a_values_precommit),
-                                                                   beta_values_precommit),
-                                                                   framework::dataset::make("broadcast_bias", { true } )),
-                                                                   lhs_transpose_values),
-                                                                   act_values),
-                                                                   post_op_lists)
-                                                                   )
-{
-    // Validate output
-    if(validate_result)
-    {
-        validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision);
-    }
-    else
-    {
-        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
-        framework::ARM_COMPUTE_PRINT_INFO();
-    }
-}
-
-TEST_SUITE_END() // FusedPostOps
-
 TEST_SUITE_END() // MixedPrecision
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // GEMMMatrixMultiplyReshaped
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
index 53038c8..dafc8dc 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/PostOps.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
@@ -62,11 +61,6 @@
 template <typename T>
 using CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture = GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshapedOnlyRHS>;
 
-// Fixture for CLGEMMMatrixMultiplyReshapedOnlyRHS with post ops
-template <typename T>
-using CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture =
-    GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshapedOnlyRHS>;
-
 namespace
 {
 // *INDENT-OFF*
@@ -164,106 +158,6 @@
                                     broadcast_bias_values),
                                     framework::dataset::make("Activation", ActivationLayerInfo()));
 
-/** Post Ops */
-using PostOpArgBroadcast =  CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<float>::PostOpArgBroadcast;
-experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(true, true, false),   // If broadcast in dims 0, 1 and 2
-        0,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_2()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(false, true, true),   // If broadcast in dims 0, 1 and 2
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-experimental::PostOpList<PostOpArgBroadcast> post_ops_3()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, true),  // If broadcast in dims 0, 1 and 2
-        1,
-        ConvertPolicy::SATURATE);
-    return post_ops;
-}
-// To test that the output of the main op is the first parameter in prelu post op
-experimental::PostOpList<PostOpArgBroadcast> post_ops_4()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, true),   // If true, broadcast in corresponding dim: 0, 1 or 2
-        0,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-// To test that the output of the main op is the second parameter in prelu post op i.e. it is the alpha_param
-experimental::PostOpList<PostOpArgBroadcast> post_ops_5()
-{
-    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
-    post_ops.push_back_op<experimental::PostOpEltwisePRelu<PostOpArgBroadcast>>(
-        std::make_tuple(false, false, false),   // If true, broadcast in corresponding dim: 0, 1 or 2
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
-    return post_ops;
-}
-/** Different Post Op Lists */
-const auto post_op_lists = framework::dataset::make("post_op_lists", {
-    post_ops_1(),
-    post_ops_2(),
-    post_ops_3(),
-    post_ops_4(),
-    post_ops_5()
- } );
-
- bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList<ITensorInfo*>& post_ops)
-{
-    const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true);
-    const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false);
-
-    // Create TensorInfo for post op arguments
-    TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
-    TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
-    TensorInfo input2_info(TensorShape(n), 1, data_type);
-    TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);
-
-    const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info));
-
-    GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
-             false /**< reinterpret the input as 3D */,
-             true  /**< Flag used to broadcast the bias addition */,
-             false /**< wider accumm */,
-             false /**< has pad y */,
-           ActivationLayerInfo::ActivationFunction::IDENTITY,
-             1   /**< Multiplication factor for the width of the 1xW transposed block */,
-             1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
-             lhs_info,
-             rhs_info,
-             0  /**< Offset to be added to each element of the matrix A */,
-             0 /**< Offset to be added to each element of the matrix B */,
-             post_ops);
-    return bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(&input0_info.clone()->set_is_resizable(true),
-                                                          &reshaped_input1_info.clone()->set_is_resizable(true),
-                                                          &input2_info.clone()->set_is_resizable(true),
-                                                          &output_info.clone()->set_is_resizable(true),1.f,1.f,
-                                                          lhs_info,
-                                                          rhs_info,
-                                                          gemm_info));
-}
 /** Configuration test */
 bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
                             unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
@@ -370,119 +264,6 @@
     ARM_COMPUTE_EXPECT(status == expected_value, framework::LogLevel::ERRORS);
 }
 
-TEST_SUITE(ValidateFusedPostOpsConfigs)
-TEST_SUITE(Invalid)
-TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 17;
-    const unsigned int n = 1;
-    const unsigned int k = 13;
-    const unsigned int batch = 2;
-    TensorShape post_op_arg0_shape(n, m, batch);
-    TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
-    auto post_op_arg1_info = post_op_arg_info.clone();
-
-    // Unsupported sequence of post ops
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
-        &post_op_arg_info,
-        1,
-        ConvertPolicy::SATURATE);
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
-        post_op_arg1_info.get(),
-        0,
-        ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(OutputWidened, framework::DatasetMode::ALL)
-{
-    // Invalid broadcast: post op tensors "widen" the output tensor
-    const auto data_type = DataType::F32;
-    const unsigned int m = 17;
-    const unsigned int n = 1;
-    const unsigned int k = 1;
-    const unsigned int batch = 1;
-    TensorShape post_op_arg_shape(n, m, batch + 4); // output's batch dimension is "widened", which is not allowed
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL)
-{
-    // Invalid broadcast: post op tensors broadcast in the first dimension (X) only
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, m, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-TEST_SUITE(Valid)
-TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(n, 1, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, 1, batch);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL)
-{
-    const auto data_type = DataType::F32;
-    const unsigned int m = 22;
-    const unsigned int n = 16;
-    const unsigned int k = 15;
-    const unsigned int batch = 3;
-    TensorShape post_op_arg_shape(1, 1, 1);
-    TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
-    experimental::PostOpList<ITensorInfo*> post_ops{};
-    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
-
-    ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Valid
-TEST_SUITE_END() // ValidateFusedPostOps
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
@@ -684,43 +465,6 @@
     }
 }
 
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_values,
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   m0_values_precommit),
-                                                                   n0_values_precommit),
-                                                                   k0_values_precommit),
-                                                                   framework::dataset::make("H0", {1})),
-                                                                   framework::dataset::make("interleave_rhs", { true })),
-                                                                   t_values_rhs),
-                                                                   framework::dataset::make("export_to_cl_image_rhs", {false, true})),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                   a_values),
-                                                                   beta_values),
-                                                                   framework::dataset::make("broadcast_bias", { false } )),
-                                                                   act_values),
-                                                                   post_op_lists)
-                                                                   )
-{
-    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
-    if(validate_result)
-    {
-        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
-    }
-    else
-    {
-        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
-        framework::ARM_COMPUTE_PRINT_INFO();
-    }
-}
-
-TEST_SUITE_END() //  FusedPostOps
-
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
@@ -849,42 +593,6 @@
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
-TEST_SUITE(FusedPostOps)
-
-FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
-                                                                   m_values,
-                                                                   n_values),
-                                                                   k_values),
-                                                                   b_values),
-                                                                   m0_values_precommit),
-                                                                   n0_values_precommit),
-                                                                   k0_values_precommit),
-                                                                   framework::dataset::make("H0", {1})),
-                                                                   framework::dataset::make("interleave_rhs", { true })),
-                                                                   t_values_rhs),
-                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
-                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                   a_values),
-                                                                   beta_values),
-                                                                   framework::dataset::make("broadcast_bias", { false } )),
-                                                                   act_values),
-                                                                   post_op_lists)
-                                                                   )
-{
-    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
-    if(validate_result)
-    {
-        validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
-    }
-    else
-    {
-        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
-        framework::ARM_COMPUTE_PRINT_INFO();
-    }
-}
-
-TEST_SUITE_END() //  FusedPostOps
 
 TEST_SUITE_END() // FP16
 
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
index f27a179..bae8cbf 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -58,7 +58,7 @@
  * No quantized tests               | Not supported yet
  * No grouped CNN tests             | Not supported yet
  * No mixed layout tests            | Not needed; only NHWC is supported
- * No activation/post op tests      | Not needed in fusion
+ * No activation tests              | Not needed in fusion
  * No ValidateConvolutionMethod     | Only a single method (direct conv2d) is supported
  * No ReshapeWeights = true tests   | Not applicable yet. This parameter only concerns gemm-based conv2d
  * No RunSmallWithPadding tests     | Padding is removed
@@ -70,9 +70,7 @@
 using DynamicFusionGpuConv2dFixture = DynamicFusionGpuConv2dValidationFixture<CLTensor, CLAccessor, GpuConv2d, T>;
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                            framework::dataset::make("QuantizationInfo", QuantizationInfo())))
+                                                                                                                    framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("QuantizationInfo", QuantizationInfo())))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -81,9 +79,7 @@
 
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                           framework::dataset::make("QuantizationInfo", QuantizationInfo())))
+                                                                                                                   framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), framework::dataset::make("QuantizationInfo", QuantizationInfo())))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index f1e0ee9..afde3d8 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -21,14 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_GEMM_FIXTURE
-#define ARM_COMPUTE_TEST_GEMM_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
 
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "src/core/experimental/PostOpUtils.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -38,7 +36,6 @@
 #include "tests/validation/reference/ActivationLayer.h"
 #include "tests/validation/reference/ElementwiseOperations.h"
 #include "tests/validation/reference/GEMM.h"
-#include "tests/validation/reference/PostOps.h"
 
 #include <random>
 
@@ -304,8 +301,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
             { ACL_SRC_1, &rhs },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -426,8 +422,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
             { ACL_SRC_1, &rhs },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -580,8 +575,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
             { ACL_SRC_1, &rhs_reshaped },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -734,8 +728,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
             { ACL_SRC_1, &rhs_reshaped },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -908,8 +901,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
             { ACL_SRC_1, &rhs_reshaped },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -960,262 +952,6 @@
     SimpleTensor<T> _reference{};
 };
 
-/** (EXPERIMENTAL_POST_OPS)*/
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType, bool fp_mixed_precision = false>
-class GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture : public framework::Fixture
-{
-public:
-    using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument
-public:
-    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool interleave_lhs,
-               bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info,
-               const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
-    {
-        GEMMLHSMatrixInfo lhs_info;
-        lhs_info.m0         = m0;
-        lhs_info.k0         = k0;
-        lhs_info.v0         = v0;
-        lhs_info.interleave = interleave_lhs;
-        lhs_info.transpose  = lhs_transpose;
-
-        GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0                 = n0;
-        rhs_info.k0                 = k0;
-        rhs_info.h0                 = h0;
-        rhs_info.interleave         = interleave_rhs;
-        rhs_info.transpose          = !lhs_transpose;
-        rhs_info.export_to_cl_image = export_to_cl_image;
-
-        // Set the tensor shapes for LHS and RHS matrices
-        const TensorShape lhs_shape(k, m, batch_size);
-        const TensorShape rhs_shape(n, k, batch_size);
-        const TensorShape bias_shape(n,
-                                     broadcast_bias ? 1 : m,
-                                     broadcast_bias ? 1 : batch_size);
-        auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
-                                                                                                                    [ = ](auto broadcast)
-        {
-            return TensorShape
-            {
-                std::get<0>(broadcast) ? 1 : n,
-                std::get<1>(broadcast) ? 1 : m,
-                std::get<2>(broadcast) ? 1 : batch_size,
-            };
-        });
-
-        _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
-        if(validate_result)
-        {
-            _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
-        }
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
-        using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
-        DistributionType distribution{ T(-1.0f), T(1.0f) };
-        library->fill(tensor, distribution, i);
-
-        // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
-        DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
-        library->fill_borders_with_garbage(tensor, distribution_inf, i);
-    }
-
-    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                              DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
-    {
-        // Create tensors
-        TensorType lhs  = create_tensor<TensorType>(lhs_shape, data_type, 1);
-        TensorType rhs  = create_tensor<TensorType>(rhs_shape, data_type, 1);
-        TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
-
-        // Create post op tensors and populate post op with them
-        std::vector<TensorType> post_op_tensors_holder{};
-        auto                    populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
-                                                                                                                                [&post_op_tensors_holder, &data_type](auto shape)
-        {
-            auto t = create_tensor<TensorType>(shape, data_type, 1);
-            post_op_tensors_holder.push_back(std::move(t));
-            return post_op_tensors_holder.back().info();
-        });
-        TensorType lhs_reshaped;
-        TensorType rhs_reshaped;
-        TensorType dst;
-
-        const unsigned int M = lhs_shape[1];
-        const unsigned int N = rhs_shape[0];
-        const unsigned int K = lhs_shape[0];
-        GEMMKernelInfo     kernel_info;
-        kernel_info.m                       = M;
-        kernel_info.n                       = N;
-        kernel_info.k                       = K;
-        kernel_info.depth_output_gemm3d     = 0;
-        kernel_info.reinterpret_input_as_3d = false;
-        kernel_info.broadcast_bias          = broadcast_bias;
-        kernel_info.activation_info         = act_info;
-        kernel_info.fp_mixed_precision      = fp_mixed_precision;
-        kernel_info.post_ops                = populated_post_ops;
-
-        // The output tensor will be auto-initialized within the function
-
-        // Create and configure function
-        ReshapeLHSOperatorType reshape_lhs;
-        ReshapeRHSOperatorType reshape_rhs;
-        GEMMOperatorType       gemm;
-
-        validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
-        validate_result = validate_result || !rhs_info.export_to_cl_image;
-        if(!validate_result)
-        {
-            return nullptr;
-        }
-
-        reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
-        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
-        gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
-
-        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
-        for(const auto &tensor : post_op_tensors_holder)
-        {
-            ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
-        }
-
-        // We do not pad when using image as it needs to comply to strict pitch alignment restrictions
-        if(!rhs_info.export_to_cl_image)
-        {
-            add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst });
-            for(auto &tensor : post_op_tensors_holder)
-            {
-                add_padding_x({ &tensor });
-            }
-        }
-
-        // Allocate tensors
-        lhs.allocator()->allocate();
-        rhs.allocator()->allocate();
-        lhs_reshaped.allocator()->allocate();
-        rhs_reshaped.allocator()->allocate();
-        bias.allocator()->allocate();
-        dst.allocator()->allocate();
-        for(auto &tensor : post_op_tensors_holder)
-        {
-            tensor.allocator()->allocate();
-        }
-
-        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
-        for(const auto &tensor : post_op_tensors_holder)
-        {
-            ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
-        }
-
-        // Fill tensors
-        fill(AccessorType(lhs), 0);
-        fill(AccessorType(rhs), 1);
-        fill(AccessorType(bias), 2);
-        for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
-        {
-            fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
-        }
-
-        // Compute GEMM
-        ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } };
-        reshape_lhs.run(reshape_lhs_pack);
-        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
-        reshape_rhs.run(reshape_rhs_pack);
-        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
-            { ACL_SRC_1, &rhs_reshaped },
-            { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
-        for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
-        {
-            gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
-        }
-        gemm.run(gemm_pack);
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
-                                      const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
-    {
-        TensorShape dst_shape = lhs_shape;
-        dst_shape[0]          = rhs_shape[0];
-        dst_shape[1]          = lhs_shape[1];
-
-        // Create reference
-        SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
-        SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
-        SimpleTensor<T> bias{ dst_shape, data_type, 1 };
-        // Create post op tensors and populate post op with them
-        auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
-        {
-            return SimpleTensor<T> { shape, data_type, 1 };
-        });
-
-        const int n          = rhs_shape[0];
-        const int m          = lhs_shape[1];
-        const int batch_size = lhs_shape[2];
-
-        // Fill reference
-        int tensor_idx = 0;
-        fill(lhs, tensor_idx++);
-        fill(rhs, tensor_idx++);
-        fill(bias, tensor_idx++);
-        for(auto &op : populated_post_ops.get_list())
-        {
-            for(auto tensor : op->arguments())
-            {
-                fill(*tensor, tensor_idx++);
-            }
-        }
-
-        if(broadcast_bias)
-        {
-            // In case of broadcast, we need to simply copy the first into the following "M" ones
-            for(int i = 1; i < m * batch_size; i++)
-            {
-                memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
-            }
-        }
-
-        SimpleTensor<T> out;
-        if(fp_mixed_precision)
-        {
-            out = reference::gemm_mixed_precision<T>(lhs, rhs, bias, alpha, beta);
-        }
-        else
-        {
-            out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
-        }
-        // Ignore activation info if post ops are used instead
-        if(populated_post_ops.size() > 0)
-        {
-            out = reference::post_ops<T>(out, populated_post_ops);
-        }
-        else
-        {
-            out = reference::activation_layer(out, act_info);
-        }
-        return out;
-    }
-
-    bool            validate_result = true;
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-
 template <typename TensorType, typename AccessorType, typename T, typename ReshapeLHSOperatorType, typename ReshapeRHSOperatorType, typename GEMMOperatorType, bool fp_mixed_precision = false>
 class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture
 {
@@ -1344,8 +1080,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
             { ACL_SRC_1, &rhs_reshaped },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -1515,8 +1250,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
             { ACL_SRC_1, &rhs_reshaped },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -1560,242 +1294,6 @@
     SimpleTensor<T> _reference{};
 };
 
-/** (EXPERIMENTAL_POST_OPS)*/
-template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
-class GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture : public framework::Fixture
-{
-public:
-    using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument
-    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0,
-               bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info,
-               const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
-    {
-        GEMMLHSMatrixInfo lhs_info;
-        lhs_info.m0 = m0;
-        lhs_info.k0 = k0;
-
-        GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0                 = n0;
-        rhs_info.k0                 = k0;
-        rhs_info.h0                 = h0;
-        rhs_info.interleave         = interleave_rhs;
-        rhs_info.transpose          = transpose_rhs;
-        rhs_info.export_to_cl_image = export_to_cl_image;
-
-        // Set the tensor shapes for LHS and RHS matrices
-        const TensorShape lhs_shape(k, m, batch_size);
-        const TensorShape rhs_shape(n, k, batch_size);
-        const TensorShape bias_shape(n,
-                                     broadcast_bias ? 1 : m,
-                                     broadcast_bias ? 1 : batch_size);
-        auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
-                                                                                                                    [ = ](auto broadcast)
-        {
-            return TensorShape
-            {
-                std::get<0>(broadcast) ? 1 : n,
-                std::get<1>(broadcast) ? 1 : m,
-                std::get<2>(broadcast) ? 1 : batch_size,
-            };
-        });
-
-        _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
-        if(validate_result)
-        {
-            _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
-        }
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
-        using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
-        DistributionType distribution{ T(-1.0f), T(1.0f) };
-        library->fill(tensor, distribution, i);
-
-        // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
-        DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
-        library->fill_borders_with_garbage(tensor, distribution_inf, i);
-    }
-
-    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                              DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
-    {
-        // Create tensors
-        TensorType lhs  = create_tensor<TensorType>(lhs_shape, data_type, 1);
-        TensorType rhs  = create_tensor<TensorType>(rhs_shape, data_type, 1);
-        TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
-        TensorType rhs_reshaped;
-        TensorType dst;
-        // Create post op tensors and populate post op with them
-        std::vector<TensorType> post_op_tensors_holder{};
-        auto                    populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
-                                                                                                                                [&post_op_tensors_holder, &data_type](auto shape)
-        {
-            auto t = create_tensor<TensorType>(shape, data_type, 1);
-            post_op_tensors_holder.push_back(std::move(t));
-            return post_op_tensors_holder.back().info();
-        });
-
-        const unsigned int M = lhs_shape[1];
-        const unsigned int N = rhs_shape[0];
-        const unsigned int K = lhs_shape[0];
-        GEMMKernelInfo     kernel_info;
-        kernel_info.m                       = M;
-        kernel_info.n                       = N;
-        kernel_info.k                       = K;
-        kernel_info.depth_output_gemm3d     = 0;
-        kernel_info.reinterpret_input_as_3d = false;
-        kernel_info.broadcast_bias          = broadcast_bias;
-        kernel_info.activation_info         = act_info;
-        kernel_info.post_ops                = populated_post_ops;
-
-        // The output tensor will be auto-initialized within the function
-
-        // Create and configure function
-        ReshapeRHSOperatorType reshape_rhs;
-        GEMMOperatorType       gemm;
-
-        validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
-        validate_result = validate_result || !rhs_info.export_to_cl_image;
-        if(!validate_result)
-        {
-            return nullptr;
-        }
-
-        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
-        gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
-
-        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
-        for(const auto &tensor : post_op_tensors_holder)
-        {
-            ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
-        }
-
-        // We do not pad when using an image as it needs to comply with strict pitch alignment restrictions
-        if(!rhs_info.export_to_cl_image)
-        {
-            add_padding_x({ &lhs, &rhs, &rhs_reshaped, &bias, &dst });
-            for(auto &tensor : post_op_tensors_holder)
-            {
-                add_padding_x({ &tensor });
-            }
-        }
-
-        // Allocate tensors
-        lhs.allocator()->allocate();
-        rhs.allocator()->allocate();
-        rhs_reshaped.allocator()->allocate();
-        bias.allocator()->allocate();
-        dst.allocator()->allocate();
-        for(auto &tensor : post_op_tensors_holder)
-        {
-            tensor.allocator()->allocate();
-        }
-
-        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
-        for(const auto &tensor : post_op_tensors_holder)
-        {
-            ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
-        }
-
-        // Fill tensors
-        fill(AccessorType(lhs), 0);
-        fill(AccessorType(rhs), 1);
-        fill(AccessorType(bias), 2);
-        for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
-        {
-            fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
-        }
-
-        // Compute GEMM
-        ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
-        reshape_rhs.run(reshape_rhs_pack);
-        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
-            { ACL_SRC_1, &rhs_reshaped },
-            { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
-        for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
-        {
-            gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
-        }
-        gemm.run(gemm_pack);
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
-                                      const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
-    {
-        TensorShape dst_shape = lhs_shape;
-        dst_shape[0]          = rhs_shape[0];
-        dst_shape[1]          = lhs_shape[1];
-
-        // Create reference
-        SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
-        SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
-        SimpleTensor<T> bias{ dst_shape, data_type, 1 };
-        // Create post op tensors and populate post op with them
-        auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
-        {
-            return SimpleTensor<T> { shape, data_type, 1 };
-        });
-
-        const int n          = rhs_shape[0];
-        const int m          = lhs_shape[1];
-        const int batch_size = lhs_shape[2];
-
-        // Fill reference
-        int tensor_idx = 0;
-        fill(lhs, tensor_idx++);
-        fill(rhs, tensor_idx++);
-        fill(bias, tensor_idx++);
-        for(auto &op : populated_post_ops.get_list())
-        {
-            for(auto tensor : op->arguments())
-            {
-                fill(*tensor, tensor_idx++);
-            }
-        }
-
-        if(broadcast_bias)
-        {
-            // In case of broadcast, we need to simply copy the first into the following "M" ones
-            for(int i = 1; i < m * batch_size; i++)
-            {
-                memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
-            }
-        }
-
-        SimpleTensor<T> out;
-        out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
-        // Ignore activation info if post ops are used instead
-        if(populated_post_ops.size() > 0)
-        {
-            out = reference::post_ops<T>(out, populated_post_ops);
-        }
-        else
-        {
-            out = reference::activation_layer(out, act_info);
-        }
-        return out;
-    }
-
-    bool            validate_result = true;
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-
 template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
 class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::Fixture
 {
@@ -1921,8 +1419,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
             { ACL_SRC_1, &rhs_reshaped },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -2057,8 +1554,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
             { ACL_SRC_1, &rhs },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -2102,212 +1598,6 @@
 };
 
 template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
-class GEMMMatrixMultiplyNativeWithPostOpsValidationFixture : public framework::Fixture
-{
-public:
-    using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument
-public:
-    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, DataType data_type, float alpha, float beta, bool broadcast_bias,
-               const ActivationLayerInfo &act_info, const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
-    {
-        GEMMLHSMatrixInfo lhs_info;
-        lhs_info.m0 = m0;
-        lhs_info.k0 = k0;
-
-        GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0 = n0;
-        rhs_info.k0 = k0;
-
-        // Set the tensor shapes for LHS and RHS matrices
-        const TensorShape lhs_shape(k, m, batch_size);
-        const TensorShape rhs_shape(n, k, batch_size);
-        const TensorShape bias_shape(n,
-                                     broadcast_bias ? 1 : m,
-                                     broadcast_bias ? 1 : batch_size);
-        const auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
-                                                                                                                          [ = ](auto broadcast)
-        {
-            return TensorShape
-            {
-                std::get<0>(broadcast) ? 1 : n,
-                std::get<1>(broadcast) ? 1 : m,
-                std::get<2>(broadcast) ? 1 : batch_size,
-            };
-        });
-
-        _target    = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
-        _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
-        using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
-        DistributionType distribution{ T(-1.0f), T(1.0f) };
-        library->fill(tensor, distribution, i);
-
-        // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
-        DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
-        library->fill_borders_with_garbage(tensor, distribution_inf, i);
-    }
-
-    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                              DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
-    {
-        // Create tensors
-        TensorType lhs  = create_tensor<TensorType>(lhs_shape, data_type, 1);
-        TensorType rhs  = create_tensor<TensorType>(rhs_shape, data_type, 1);
-        TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
-        TensorType dst;
-        // Create post op tensors and populate post op with them
-        std::vector<TensorType> post_op_tensors_holder{};
-        auto                    populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
-                                                                                                                                [&post_op_tensors_holder, &data_type](auto shape)
-        {
-            auto t = create_tensor<TensorType>(shape, data_type, 1);
-            post_op_tensors_holder.push_back(std::move(t));
-            return post_op_tensors_holder.back().info();
-        });
-
-        const unsigned int M = lhs_shape[1];
-        const unsigned int N = rhs_shape[0];
-        const unsigned int K = lhs_shape[0];
-        GEMMKernelInfo     kernel_info;
-        kernel_info.m                       = M;
-        kernel_info.n                       = N;
-        kernel_info.k                       = K;
-        kernel_info.depth_output_gemm3d     = 0;
-        kernel_info.reinterpret_input_as_3d = false;
-        kernel_info.broadcast_bias          = broadcast_bias;
-        kernel_info.activation_info         = act_info;
-        kernel_info.post_ops                = populated_post_ops;
-
-        // Create and configure function
-        GEMMOperatorType gemm;
-        gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
-
-        ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
-        for(const auto &tensor : post_op_tensors_holder)
-        {
-            ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
-        }
-
-        add_padding_x({ &lhs, &rhs, &bias, &dst });
-        for(auto &tensor : post_op_tensors_holder)
-        {
-            add_padding_x({ &tensor });
-        }
-
-        // Allocate tensors
-        lhs.allocator()->allocate();
-        rhs.allocator()->allocate();
-        bias.allocator()->allocate();
-        dst.allocator()->allocate();
-        for(auto &tensor : post_op_tensors_holder)
-        {
-            tensor.allocator()->allocate();
-        }
-
-        ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
-        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
-        for(const auto &tensor : post_op_tensors_holder)
-        {
-            ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
-        }
-
-        // Fill tensors
-        fill(AccessorType(lhs), 0);
-        fill(AccessorType(rhs), 1);
-        fill(AccessorType(bias), 2);
-        for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
-        {
-            fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
-        }
-
-        // Compute GEMM
-        ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
-            { ACL_SRC_1, &rhs },
-            { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
-        for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
-        {
-            gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
-        }
-        gemm.run(gemm_pack);
-
-        return dst;
-    }
-
-    SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
-                                      const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
-    {
-        TensorShape dst_shape = lhs_shape;
-        dst_shape[0]          = rhs_shape[0];
-        dst_shape[1]          = lhs_shape[1];
-
-        // Create reference
-        SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
-        SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
-        SimpleTensor<T> bias{ dst_shape, data_type, 1 };
-        // Create post op tensors and populate post op with them
-        auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
-        {
-            return SimpleTensor<T> { shape, data_type, 1 };
-        });
-
-        const int n          = rhs_shape[0];
-        const int m          = lhs_shape[1];
-        const int batch_size = lhs_shape[2];
-
-        // Fill reference
-        int tensor_idx = 0;
-        fill(lhs, tensor_idx++);
-        fill(rhs, tensor_idx++);
-        fill(bias, tensor_idx++);
-        for(auto &op : populated_post_ops.get_list())
-        {
-            for(auto tensor : op->arguments())
-            {
-                fill(*tensor, tensor_idx++);
-            }
-        }
-
-        if(broadcast_bias)
-        {
-            // In case of broadcast, we need to simply copy the first into the following "M" ones
-            for(int i = 1; i < m * batch_size; i++)
-            {
-                memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
-            }
-        }
-
-        SimpleTensor<T> out;
-        out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
-        // Ignore activation info if post ops are used instead
-        if(populated_post_ops.size() > 0)
-        {
-            out = reference::post_ops<T>(out, populated_post_ops);
-        }
-        else
-        {
-            out = reference::activation_layer(out, act_info);
-        }
-        return out;
-    }
-
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-};
-
-template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
 class GEMMMatrixMultiplyNative3DValidationFixture : public framework::Fixture
 {
 public:
@@ -2398,8 +1688,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
             { ACL_SRC_1, &rhs },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -2557,8 +1846,7 @@
         ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
             { ACL_SRC_1, &rhs_reshaped },
             { ACL_SRC_2, &bias },
-            { ACL_DST, &dst }
-        });
+            { ACL_DST, &dst } });
         gemm.run(gemm_pack);
 
         return dst;
@@ -2608,4 +1896,4 @@
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMM_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_GEMMFIXTURE_H
diff --git a/tests/validation/reference/PostOps.cpp b/tests/validation/reference/PostOps.cpp
deleted file mode 100644
index ecfed4c..0000000
--- a/tests/validation/reference/PostOps.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "PostOps.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/PostOps.h"
-#include "support/Cast.h"
-#include "tests/validation/reference/ActivationLayer.h"
-#include "tests/validation/reference/ElementwiseOperations.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> post_ops(const SimpleTensor<T> &a, experimental::PostOpList<SimpleTensor<T>> post_ops)
-{
-    // Create reference
-    SimpleTensor<T> dst{ a };
-
-    for(auto &post_op : post_ops.get_list())
-    {
-        switch(post_op->type())
-        {
-            case experimental::PostOpType::Activation:
-            {
-                const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<SimpleTensor<T>> *>(post_op.get());
-                dst                 = reference::activation_layer(dst, _post_op->_act_info);
-                break;
-            }
-            case experimental::PostOpType::Eltwise_Add:
-            {
-                const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwiseAdd<SimpleTensor<T>> *>(post_op.get());
-                dst                 = reference::arithmetic_operation(ArithmeticOperation::ADD, dst, _post_op->_addend, dst, _post_op->_policy);
-                break;
-            }
-            case experimental::PostOpType::Eltwise_PRelu:
-            {
-                const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwisePRelu<SimpleTensor<T>> *>(post_op.get());
-
-                // If previous main operation output is the first pRelu argument, then pass it as src1 parameter of the arithmetic operation
-                if(_post_op->_prev_dst_pos == 0)
-                {
-                    dst = reference::arithmetic_operation(ArithmeticOperation::PRELU, dst, _post_op->_alpha_param, dst, _post_op->_policy);
-                }
-                // If previous main operation output is the second pRelu argument, then pass it as src2 parameter of the arithmetic operation
-                else if(_post_op->_prev_dst_pos == 1)
-                {
-                    dst = reference::arithmetic_operation(ArithmeticOperation::PRELU, _post_op->_alpha_param, dst, dst, _post_op->_policy);
-                }
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Unsupported PostOpType");
-            }
-        }
-    }
-    return dst;
-}
-
-template SimpleTensor<float> post_ops(const SimpleTensor<float> &a, experimental::PostOpList<SimpleTensor<float>> post_ops);
-template SimpleTensor<half> post_ops(const SimpleTensor<half> &a, experimental::PostOpList<SimpleTensor<half>> post_ops);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/reference/PostOps.h b/tests/validation/reference/PostOps.h
deleted file mode 100644
index 5fe0fe7..0000000
--- a/tests/validation/reference/PostOps.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_POSTOPS_H
-#define ARM_COMPUTE_TEST_POSTOPS_H
-
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "tests/SimpleTensor.h"
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-/** (EXPERIMENTAL_POST_OPS) */
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> post_ops(const SimpleTensor<T> &a, experimental::PostOpList<SimpleTensor<T>> post_ops);
-
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_POSTOPS_H */
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 4bc326b..69cc3d4 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -21,8 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TYPE_PRINTER_H__
-#define __ARM_COMPUTE_TYPE_PRINTER_H__
+
+#ifndef ACL_UTILS_TYPEPRINTER_H
+#define ACL_UTILS_TYPEPRINTER_H
 
 #ifdef ARM_COMPUTE_OPENCL_ENABLED
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -36,8 +37,6 @@
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/IPostOp.h"
-#include "arm_compute/core/experimental/PostOps.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
@@ -150,144 +149,6 @@
     return str.str();
 }
 
-/** @name (EXPERIMENTAL_POST_OPS)
- * @{
- */
-/** Formatted output of the @ref experimental::PostOpType type
- *
- * @param[out] os           Output stream.
- * @param[in]  post_op_type Type to output.
- *
- * @return Modified output stream.
- */
-inline ::std::ostream &operator<<(::std::ostream &os, experimental::PostOpType post_op_type)
-{
-    os << "type=";
-    switch(post_op_type)
-    {
-        case experimental::PostOpType::Activation:
-        {
-            os << "Activation";
-            break;
-        }
-        case experimental::PostOpType::Eltwise_Add:
-        {
-            os << "Eltwise_Add";
-            break;
-        }
-        case experimental::PostOpType::Eltwise_PRelu:
-        {
-            os << "Eltwise_PRelu";
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Unsupported PostOpType");
-            break;
-        }
-    }
-    return os;
-}
-/** Converts a @ref experimental::PostOpType to string
- *
- * @param[in] post_op_type PostOpType value to be converted
- *
- * @return String representing the corresponding PostOpType
- */
-inline std::string to_string(experimental::PostOpType post_op_type)
-{
-    std::stringstream str;
-    str << post_op_type;
-    return str.str();
-}
-/** Formatted output of the @ref experimental::IPostOp type.
- *
- * @param[out] os      Output stream.
- * @param[in]  post_op Type to output.
- *
- * @return Modified output stream.
- */
-template <typename T>
-inline ::std::ostream &operator<<(::std::ostream &os, const experimental::IPostOp<T> &post_op)
-{
-    os << "<";
-    os << post_op.type() << ",";
-    os << "prev_dst_pos=" << post_op.prev_dst_pos() << ",";
-    switch(post_op.type())
-    {
-        case experimental::PostOpType::Activation:
-        {
-            const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<T> *>(&post_op);
-            os << "act_info=" << &(_post_op->_act_info);
-            break;
-        }
-        case experimental::PostOpType::Eltwise_Add:
-        {
-            const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwiseAdd<T> *>(&post_op);
-            os << "convert_policy=" << _post_op->_policy;
-            break;
-        }
-        case experimental::PostOpType::Eltwise_PRelu:
-        {
-            const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwisePRelu<T> *>(&post_op);
-            os << "convert_policy=" << _post_op->_policy;
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Unsupported PostOpType");
-            break;
-        }
-    }
-    os << ">";
-    return os;
-}
-/** Converts an @ref experimental::IPostOp to string
- *
- * @param[in] post_op IPostOp value to be converted
- *
- * @return String representing the corresponding IPostOp
- */
-template <typename T>
-inline std::string to_string(const experimental::IPostOp<T> &post_op)
-{
-    std::stringstream str;
-    str << post_op;
-    return str.str();
-}
-/** Formatted output of the @ref experimental::PostOpList type.
- *
- * @param[out] os       Output stream.
- * @param[in]  post_ops Type to output.
- *
- * @return Modified output stream.
- */
-template <typename T>
-inline ::std::ostream &operator<<(::std::ostream &os, const experimental::PostOpList<T> &post_ops)
-{
-    os << "[";
-    for(const auto &post_op : post_ops.get_list())
-    {
-        os << *post_op << ",";
-    }
-    os << "]";
-    return os;
-}
-/** Converts a @ref experimental::PostOpList to string
- *
- * @param[in] post_ops PostOpList value to be converted
- *
- * @return String representing the corresponding PostOpList
- */
-template <typename T>
-inline std::string to_string(const experimental::PostOpList<T> &post_ops)
-{
-    std::stringstream str;
-    str << post_ops;
-    return str.str();
-}
-/** @} */ // end of group (EXPERIMENTAL_POST_OPS)
-
 /** Formatted output of the Dimensions type.
  *
  * @param[out] os         Output stream.
@@ -399,7 +260,6 @@
     os << " mult_interleave4x4_height=" << gemm_info.mult_interleave4x4_height;
     os << " a_offset=" << gemm_info.a_offset;
     os << " b_offset=" << gemm_info.b_offset;
-    os << "post_ops=" << gemm_info.post_ops;
     os << ")";
     return os;
 }
@@ -1563,7 +1423,7 @@
     os << "fp_mixed_precision=" << info.fp_mixed_precision() << ",";
     os << "broadcast_bias=" << info.broadcast_bias() << ",";
     os << "pretranspose_B=" << info.pretranspose_B() << ",";
-    os << "post_ops=" << info.post_ops() << "}";
+    os << "}";
 
     return os;
 }
@@ -2883,7 +2743,7 @@
        << "act_info=" << to_string(conv_info.act_info) << ", "
        << "enable_fast_math=" << conv_info.enable_fast_math << ", "
        << "num_groups=" << conv_info.num_groups << ","
-       << "post_ops=" << conv_info.post_ops << "}";
+       << "}";
     return os;
 }
 
@@ -3772,4 +3632,4 @@
 
 } // namespace arm_compute
 
-#endif /* __ARM_COMPUTE_TYPE_PRINTER_H__ */
+#endif // ACL_UTILS_TYPEPRINTER_H